aesni-intel_asm.S

  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. #include <asm/nospec-branch.h>
/*
* The following macros are used to move an (un)aligned 16 byte value to/from
* an XMM register. This can be done for either FP or integer values; for FP
* use movaps (move aligned packed single), for integer use movdqa (move double
* quad aligned). There has been no performance difference between the two
* instructions since Nehalem (the original Core i7) was released, but movaps
* is one byte shorter, so that is the one we'll use for now (same for the
* unaligned variants).
*/
  42. #define MOVADQ movaps
  43. #define MOVUDQ movups
  44. #ifdef __x86_64__
  45. .data
  46. .align 16
  47. .Lgf128mul_x_ble_mask:
  48. .octa 0x00000000000000010000000000000087
  49. POLY: .octa 0xC2000000000000000000000000000001
  50. TWOONE: .octa 0x00000001000000000000000000000001
  51. # order of these constants should not change.
  52. # more specifically, ALL_F should follow SHIFT_MASK,
  53. # and ZERO should follow ALL_F
  54. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  55. MASK1: .octa 0x0000000000000000ffffffffffffffff
  56. MASK2: .octa 0xffffffffffffffff0000000000000000
  57. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  58. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  59. ZERO: .octa 0x00000000000000000000000000000000
  60. ONE: .octa 0x00000000000000000000000000000001
  61. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  62. dec: .octa 0x1
  63. enc: .octa 0x2
  64. .text
  65. #define STACK_OFFSET 8*3
  66. #define HashKey 16*0 // store HashKey <<1 mod poly here
  67. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  68. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  69. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  70. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  71. // bits of HashKey <<1 mod poly here
  72. //(for Karatsuba purposes)
  73. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  74. // bits of HashKey^2 <<1 mod poly here
  75. // (for Karatsuba purposes)
  76. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  77. // bits of HashKey^3 <<1 mod poly here
  78. // (for Karatsuba purposes)
  79. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  80. // bits of HashKey^4 <<1 mod poly here
  81. // (for Karatsuba purposes)
  82. #define VARIABLE_OFFSET 16*8
  83. #define arg1 rdi
  84. #define arg2 rsi
  85. #define arg3 rdx
  86. #define arg4 rcx
  87. #define arg5 r8
  88. #define arg6 r9
  89. #define arg7 STACK_OFFSET+8(%r14)
  90. #define arg8 STACK_OFFSET+16(%r14)
  91. #define arg9 STACK_OFFSET+24(%r14)
  92. #define arg10 STACK_OFFSET+32(%r14)
  93. #define keysize 2*15*16(%arg1)
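# keysize is the key_length field of struct crypto_aes_ctx: it sits after the
# two 15*16-byte round-key arrays (key_enc/key_dec) and holds 16, 24 or 32,
# i.e. the AES key length in bytes.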
  94. #endif
  95. #define STATE1 %xmm0
  96. #define STATE2 %xmm4
  97. #define STATE3 %xmm5
  98. #define STATE4 %xmm6
  99. #define STATE STATE1
  100. #define IN1 %xmm1
  101. #define IN2 %xmm7
  102. #define IN3 %xmm8
  103. #define IN4 %xmm9
  104. #define IN IN1
  105. #define KEY %xmm2
  106. #define IV %xmm3
  107. #define BSWAP_MASK %xmm10
  108. #define CTR %xmm11
  109. #define INC %xmm12
  110. #define GF128MUL_MASK %xmm10
  111. #ifdef __x86_64__
  112. #define AREG %rax
  113. #define KEYP %rdi
  114. #define OUTP %rsi
  115. #define UKEYP OUTP
  116. #define INP %rdx
  117. #define LEN %rcx
  118. #define IVP %r8
  119. #define KLEN %r9d
  120. #define T1 %r10
  121. #define TKEYP T1
  122. #define T2 %r11
  123. #define TCTR_LOW T2
  124. #else
  125. #define AREG %eax
  126. #define KEYP %edi
  127. #define OUTP AREG
  128. #define UKEYP OUTP
  129. #define INP %edx
  130. #define LEN %esi
  131. #define IVP %ebp
  132. #define KLEN %ebx
  133. #define T1 %ecx
  134. #define TKEYP T1
  135. #endif
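# The 32-bit register aliases above serve the non-GCM AES routines later in
# this file; the RFC4106 GCM code below is built for x86_64 only.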
  136. #ifdef __x86_64__
  137. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  138. *
  139. *
  140. * Input: A and B (128-bits each, bit-reflected)
  141. * Output: C = A*B*x mod poly, (i.e. >>1 )
  142. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  143. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  144. *
  145. */
  146. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  147. movdqa \GH, \TMP1
  148. pshufd $78, \GH, \TMP2
  149. pshufd $78, \HK, \TMP3
  150. pxor \GH, \TMP2 # TMP2 = a1+a0
  151. pxor \HK, \TMP3 # TMP3 = b1+b0
  152. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  153. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  154. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  155. pxor \GH, \TMP2
  156. pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
  157. movdqa \TMP2, \TMP3
  158. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  159. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  160. pxor \TMP3, \GH
pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  162. # first phase of the reduction
  163. movdqa \GH, \TMP2
  164. movdqa \GH, \TMP3
movdqa \GH, \TMP4 # copy GH into TMP2, TMP3 and TMP4
# in order to perform
# independent shifts
pslld $31, \TMP2 # packed left shift <<31
pslld $30, \TMP3 # packed left shift <<30
pslld $25, \TMP4 # packed left shift <<25
  171. pxor \TMP3, \TMP2 # xor the shifted versions
  172. pxor \TMP4, \TMP2
  173. movdqa \TMP2, \TMP5
  174. psrldq $4, \TMP5 # right shift TMP5 1 DW
  175. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  176. pxor \TMP2, \GH
  177. # second phase of the reduction
movdqa \GH,\TMP2 # copy GH into TMP2, TMP3 and TMP4
# in order to perform
# independent shifts
  181. movdqa \GH,\TMP3
  182. movdqa \GH,\TMP4
psrld $1,\TMP2 # packed right shift >>1
psrld $2,\TMP3 # packed right shift >>2
psrld $7,\TMP4 # packed right shift >>7
  186. pxor \TMP3,\TMP2 # xor the shifted versions
  187. pxor \TMP4,\TMP2
  188. pxor \TMP5, \TMP2
  189. pxor \TMP2, \GH
pxor \TMP1, \GH # result is in GH
  191. .endm
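/*
* For reference, a C-with-intrinsics sketch of what GHASH_MUL computes: a
* Karatsuba carry-less multiply followed by the two-phase shift/XOR reduction
* for the bit-reflected GHASH polynomial. This is only an illustrative
* translation of the macro above, not code used by the kernel; the function
* name and build flags are made up for the example.
*
*   #include <immintrin.h>            // build with -msse2 -mpclmul
*
*   __m128i ghash_mul(__m128i gh, __m128i hk)
*   {
*       __m128i t1, t2, t3, t5;
*
*       t1 = _mm_clmulepi64_si128(gh, hk, 0x11);              // a1*b1
*       t2 = _mm_xor_si128(_mm_shuffle_epi32(gh, 78), gh);    // a1^a0
*       t3 = _mm_xor_si128(_mm_shuffle_epi32(hk, 78), hk);    // b1^b0
*       gh = _mm_clmulepi64_si128(gh, hk, 0x00);              // a0*b0
*       t2 = _mm_clmulepi64_si128(t2, t3, 0x00);              // (a1^a0)*(b1^b0)
*       t2 = _mm_xor_si128(t2, _mm_xor_si128(gh, t1));        // middle term
*       gh = _mm_xor_si128(gh, _mm_slli_si128(t2, 8));        // t1:gh holds the
*       t1 = _mm_xor_si128(t1, _mm_srli_si128(t2, 8));        //   256-bit product
*
*       // first phase of the reduction
*       t2 = _mm_xor_si128(_mm_slli_epi32(gh, 31),
*            _mm_xor_si128(_mm_slli_epi32(gh, 30), _mm_slli_epi32(gh, 25)));
*       t5 = _mm_srli_si128(t2, 4);
*       gh = _mm_xor_si128(gh, _mm_slli_si128(t2, 12));
*
*       // second phase of the reduction
*       t2 = _mm_xor_si128(_mm_srli_epi32(gh, 1),
*            _mm_xor_si128(_mm_srli_epi32(gh, 2), _mm_srli_epi32(gh, 7)));
*       gh = _mm_xor_si128(gh, _mm_xor_si128(t2, t5));
*       return _mm_xor_si128(gh, t1);                         // reduced result
*   }
*/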
  192. /*
  193. * if a = number of total plaintext bytes
  194. * b = floor(a/16)
  195. * num_initial_blocks = b mod 4
  196. * encrypt the initial num_initial_blocks blocks and apply ghash on
  197. * the ciphertext
  198. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  199. * are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  201. */
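# Worked example (hypothetical sizes): for a = 100 bytes of plaintext,
# b = floor(100/16) = 6 full blocks, so num_initial_blocks = 6 mod 4 = 2
# blocks are handled here before the 4-blocks-at-a-time loop takes over.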
  202. .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  203. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  204. MOVADQ SHUF_MASK(%rip), %xmm14
  205. mov arg7, %r10 # %r10 = AAD
  206. mov arg8, %r12 # %r12 = aadLen
  207. mov %r12, %r11
  208. pxor %xmm\i, %xmm\i
  209. _get_AAD_loop\num_initial_blocks\operation:
  210. movd (%r10), \TMP1
  211. pslldq $12, \TMP1
  212. psrldq $4, %xmm\i
  213. pxor \TMP1, %xmm\i
  214. add $4, %r10
  215. sub $4, %r12
  216. jne _get_AAD_loop\num_initial_blocks\operation
  217. cmp $16, %r11
  218. je _get_AAD_loop2_done\num_initial_blocks\operation
  219. mov $16, %r12
  220. _get_AAD_loop2\num_initial_blocks\operation:
  221. psrldq $4, %xmm\i
  222. sub $4, %r12
  223. cmp %r11, %r12
  224. jne _get_AAD_loop2\num_initial_blocks\operation
  225. _get_AAD_loop2_done\num_initial_blocks\operation:
  226. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  227. xor %r11, %r11 # initialise the data pointer offset as zero
  228. # start AES for num_initial_blocks blocks
  229. mov %arg5, %rax # %rax = *Y0
  230. movdqu (%rax), \XMM0 # XMM0 = Y0
  231. PSHUFB_XMM %xmm14, \XMM0
  232. .if (\i == 5) || (\i == 6) || (\i == 7)
  233. MOVADQ ONE(%RIP),\TMP1
  234. MOVADQ (%arg1),\TMP2
  235. .irpc index, \i_seq
  236. paddd \TMP1, \XMM0 # INCR Y0
  237. movdqa \XMM0, %xmm\index
  238. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  239. pxor \TMP2, %xmm\index
  240. .endr
  241. lea 0x10(%arg1),%r10
  242. mov keysize,%eax
  243. shr $2,%eax # 128->4, 192->6, 256->8
  244. add $5,%eax # 128->9, 192->11, 256->13
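# %eax now holds the number of full AESENC rounds (total rounds minus one:
# 9/11/13); the last round key is applied below with AESENCLAST.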
  245. aes_loop_initial_dec\num_initial_blocks:
  246. MOVADQ (%r10),\TMP1
  247. .irpc index, \i_seq
  248. AESENC \TMP1, %xmm\index
  249. .endr
  250. add $16,%r10
  251. sub $1,%eax
  252. jnz aes_loop_initial_dec\num_initial_blocks
  253. MOVADQ (%r10), \TMP1
  254. .irpc index, \i_seq
  255. AESENCLAST \TMP1, %xmm\index # Last Round
  256. .endr
  257. .irpc index, \i_seq
  258. movdqu (%arg3 , %r11, 1), \TMP1
  259. pxor \TMP1, %xmm\index
  260. movdqu %xmm\index, (%arg2 , %r11, 1)
  261. # write back plaintext/ciphertext for num_initial_blocks
  262. add $16, %r11
  263. movdqa \TMP1, %xmm\index
  264. PSHUFB_XMM %xmm14, %xmm\index
  265. # prepare plaintext/ciphertext for GHASH computation
  266. .endr
  267. .endif
  268. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  269. # apply GHASH on num_initial_blocks blocks
  270. .if \i == 5
  271. pxor %xmm5, %xmm6
  272. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  273. pxor %xmm6, %xmm7
  274. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  275. pxor %xmm7, %xmm8
  276. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  277. .elseif \i == 6
  278. pxor %xmm6, %xmm7
  279. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  280. pxor %xmm7, %xmm8
  281. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  282. .elseif \i == 7
  283. pxor %xmm7, %xmm8
  284. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  285. .endif
  286. cmp $64, %r13
  287. jl _initial_blocks_done\num_initial_blocks\operation
  288. # no need for precomputed values
  289. /*
  290. *
  291. * Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
  293. */
  294. MOVADQ ONE(%rip), \TMP1
  295. paddd \TMP1, \XMM0 # INCR Y0
  296. MOVADQ \XMM0, \XMM1
  297. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  298. paddd \TMP1, \XMM0 # INCR Y0
  299. MOVADQ \XMM0, \XMM2
  300. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  301. paddd \TMP1, \XMM0 # INCR Y0
  302. MOVADQ \XMM0, \XMM3
  303. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  304. paddd \TMP1, \XMM0 # INCR Y0
  305. MOVADQ \XMM0, \XMM4
  306. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  307. MOVADQ 0(%arg1),\TMP1
  308. pxor \TMP1, \XMM1
  309. pxor \TMP1, \XMM2
  310. pxor \TMP1, \XMM3
  311. pxor \TMP1, \XMM4
  312. movdqa \TMP3, \TMP5
  313. pshufd $78, \TMP3, \TMP1
  314. pxor \TMP3, \TMP1
  315. movdqa \TMP1, HashKey_k(%rsp)
  316. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  317. # TMP5 = HashKey^2<<1 (mod poly)
  318. movdqa \TMP5, HashKey_2(%rsp)
  319. # HashKey_2 = HashKey^2<<1 (mod poly)
  320. pshufd $78, \TMP5, \TMP1
  321. pxor \TMP5, \TMP1
  322. movdqa \TMP1, HashKey_2_k(%rsp)
  323. .irpc index, 1234 # do 4 rounds
  324. movaps 0x10*\index(%arg1), \TMP1
  325. AESENC \TMP1, \XMM1
  326. AESENC \TMP1, \XMM2
  327. AESENC \TMP1, \XMM3
  328. AESENC \TMP1, \XMM4
  329. .endr
  330. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  331. # TMP5 = HashKey^3<<1 (mod poly)
  332. movdqa \TMP5, HashKey_3(%rsp)
  333. pshufd $78, \TMP5, \TMP1
  334. pxor \TMP5, \TMP1
  335. movdqa \TMP1, HashKey_3_k(%rsp)
  336. .irpc index, 56789 # do next 5 rounds
  337. movaps 0x10*\index(%arg1), \TMP1
  338. AESENC \TMP1, \XMM1
  339. AESENC \TMP1, \XMM2
  340. AESENC \TMP1, \XMM3
  341. AESENC \TMP1, \XMM4
  342. .endr
  343. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
  345. movdqa \TMP5, HashKey_4(%rsp)
  346. pshufd $78, \TMP5, \TMP1
  347. pxor \TMP5, \TMP1
  348. movdqa \TMP1, HashKey_4_k(%rsp)
  349. lea 0xa0(%arg1),%r10
  350. mov keysize,%eax
  351. shr $2,%eax # 128->4, 192->6, 256->8
  352. sub $4,%eax # 128->0, 192->2, 256->4
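# %eax = extra AESENC rounds beyond the nine unrolled rounds above
# (0 for AES-128, 2 for AES-192, 4 for AES-256).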
  353. jz aes_loop_pre_dec_done\num_initial_blocks
  354. aes_loop_pre_dec\num_initial_blocks:
  355. MOVADQ (%r10),\TMP2
  356. .irpc index, 1234
  357. AESENC \TMP2, %xmm\index
  358. .endr
  359. add $16,%r10
  360. sub $1,%eax
  361. jnz aes_loop_pre_dec\num_initial_blocks
  362. aes_loop_pre_dec_done\num_initial_blocks:
  363. MOVADQ (%r10), \TMP2
  364. AESENCLAST \TMP2, \XMM1
  365. AESENCLAST \TMP2, \XMM2
  366. AESENCLAST \TMP2, \XMM3
  367. AESENCLAST \TMP2, \XMM4
  368. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  369. pxor \TMP1, \XMM1
  370. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  371. movdqa \TMP1, \XMM1
  372. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  373. pxor \TMP1, \XMM2
  374. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  375. movdqa \TMP1, \XMM2
  376. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  377. pxor \TMP1, \XMM3
  378. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  379. movdqa \TMP1, \XMM3
  380. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  381. pxor \TMP1, \XMM4
  382. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  383. movdqa \TMP1, \XMM4
  384. add $64, %r11
  385. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  386. pxor \XMMDst, \XMM1
  387. # combine GHASHed value with the corresponding ciphertext
  388. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  389. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  390. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  391. _initial_blocks_done\num_initial_blocks\operation:
  392. .endm
  393. /*
  394. * if a = number of total plaintext bytes
  395. * b = floor(a/16)
  396. * num_initial_blocks = b mod 4
  397. * encrypt the initial num_initial_blocks blocks and apply ghash on
  398. * the ciphertext
  399. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  400. * are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  402. */
  403. .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  404. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  405. MOVADQ SHUF_MASK(%rip), %xmm14
  406. mov arg7, %r10 # %r10 = AAD
  407. mov arg8, %r12 # %r12 = aadLen
  408. mov %r12, %r11
  409. pxor %xmm\i, %xmm\i
  410. _get_AAD_loop\num_initial_blocks\operation:
  411. movd (%r10), \TMP1
  412. pslldq $12, \TMP1
  413. psrldq $4, %xmm\i
  414. pxor \TMP1, %xmm\i
  415. add $4, %r10
  416. sub $4, %r12
  417. jne _get_AAD_loop\num_initial_blocks\operation
  418. cmp $16, %r11
  419. je _get_AAD_loop2_done\num_initial_blocks\operation
  420. mov $16, %r12
  421. _get_AAD_loop2\num_initial_blocks\operation:
  422. psrldq $4, %xmm\i
  423. sub $4, %r12
  424. cmp %r11, %r12
  425. jne _get_AAD_loop2\num_initial_blocks\operation
  426. _get_AAD_loop2_done\num_initial_blocks\operation:
  427. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  428. xor %r11, %r11 # initialise the data pointer offset as zero
  429. # start AES for num_initial_blocks blocks
  430. mov %arg5, %rax # %rax = *Y0
  431. movdqu (%rax), \XMM0 # XMM0 = Y0
  432. PSHUFB_XMM %xmm14, \XMM0
  433. .if (\i == 5) || (\i == 6) || (\i == 7)
  434. MOVADQ ONE(%RIP),\TMP1
  435. MOVADQ 0(%arg1),\TMP2
  436. .irpc index, \i_seq
  437. paddd \TMP1, \XMM0 # INCR Y0
  438. MOVADQ \XMM0, %xmm\index
  439. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  440. pxor \TMP2, %xmm\index
  441. .endr
  442. lea 0x10(%arg1),%r10
  443. mov keysize,%eax
  444. shr $2,%eax # 128->4, 192->6, 256->8
  445. add $5,%eax # 128->9, 192->11, 256->13
  446. aes_loop_initial_enc\num_initial_blocks:
  447. MOVADQ (%r10),\TMP1
  448. .irpc index, \i_seq
  449. AESENC \TMP1, %xmm\index
  450. .endr
  451. add $16,%r10
  452. sub $1,%eax
  453. jnz aes_loop_initial_enc\num_initial_blocks
  454. MOVADQ (%r10), \TMP1
  455. .irpc index, \i_seq
  456. AESENCLAST \TMP1, %xmm\index # Last Round
  457. .endr
  458. .irpc index, \i_seq
  459. movdqu (%arg3 , %r11, 1), \TMP1
  460. pxor \TMP1, %xmm\index
  461. movdqu %xmm\index, (%arg2 , %r11, 1)
  462. # write back plaintext/ciphertext for num_initial_blocks
  463. add $16, %r11
  464. PSHUFB_XMM %xmm14, %xmm\index
  465. # prepare plaintext/ciphertext for GHASH computation
  466. .endr
  467. .endif
  468. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  469. # apply GHASH on num_initial_blocks blocks
  470. .if \i == 5
  471. pxor %xmm5, %xmm6
  472. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  473. pxor %xmm6, %xmm7
  474. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  475. pxor %xmm7, %xmm8
  476. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  477. .elseif \i == 6
  478. pxor %xmm6, %xmm7
  479. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  480. pxor %xmm7, %xmm8
  481. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  482. .elseif \i == 7
  483. pxor %xmm7, %xmm8
  484. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  485. .endif
  486. cmp $64, %r13
  487. jl _initial_blocks_done\num_initial_blocks\operation
  488. # no need for precomputed values
  489. /*
  490. *
  491. * Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
  493. */
  494. MOVADQ ONE(%RIP),\TMP1
  495. paddd \TMP1, \XMM0 # INCR Y0
  496. MOVADQ \XMM0, \XMM1
  497. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  498. paddd \TMP1, \XMM0 # INCR Y0
  499. MOVADQ \XMM0, \XMM2
  500. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  501. paddd \TMP1, \XMM0 # INCR Y0
  502. MOVADQ \XMM0, \XMM3
  503. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  504. paddd \TMP1, \XMM0 # INCR Y0
  505. MOVADQ \XMM0, \XMM4
  506. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  507. MOVADQ 0(%arg1),\TMP1
  508. pxor \TMP1, \XMM1
  509. pxor \TMP1, \XMM2
  510. pxor \TMP1, \XMM3
  511. pxor \TMP1, \XMM4
  512. movdqa \TMP3, \TMP5
  513. pshufd $78, \TMP3, \TMP1
  514. pxor \TMP3, \TMP1
  515. movdqa \TMP1, HashKey_k(%rsp)
  516. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  517. # TMP5 = HashKey^2<<1 (mod poly)
  518. movdqa \TMP5, HashKey_2(%rsp)
  519. # HashKey_2 = HashKey^2<<1 (mod poly)
  520. pshufd $78, \TMP5, \TMP1
  521. pxor \TMP5, \TMP1
  522. movdqa \TMP1, HashKey_2_k(%rsp)
  523. .irpc index, 1234 # do 4 rounds
  524. movaps 0x10*\index(%arg1), \TMP1
  525. AESENC \TMP1, \XMM1
  526. AESENC \TMP1, \XMM2
  527. AESENC \TMP1, \XMM3
  528. AESENC \TMP1, \XMM4
  529. .endr
  530. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  531. # TMP5 = HashKey^3<<1 (mod poly)
  532. movdqa \TMP5, HashKey_3(%rsp)
  533. pshufd $78, \TMP5, \TMP1
  534. pxor \TMP5, \TMP1
  535. movdqa \TMP1, HashKey_3_k(%rsp)
  536. .irpc index, 56789 # do next 5 rounds
  537. movaps 0x10*\index(%arg1), \TMP1
  538. AESENC \TMP1, \XMM1
  539. AESENC \TMP1, \XMM2
  540. AESENC \TMP1, \XMM3
  541. AESENC \TMP1, \XMM4
  542. .endr
  543. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
  545. movdqa \TMP5, HashKey_4(%rsp)
  546. pshufd $78, \TMP5, \TMP1
  547. pxor \TMP5, \TMP1
  548. movdqa \TMP1, HashKey_4_k(%rsp)
  549. lea 0xa0(%arg1),%r10
  550. mov keysize,%eax
  551. shr $2,%eax # 128->4, 192->6, 256->8
  552. sub $4,%eax # 128->0, 192->2, 256->4
  553. jz aes_loop_pre_enc_done\num_initial_blocks
  554. aes_loop_pre_enc\num_initial_blocks:
  555. MOVADQ (%r10),\TMP2
  556. .irpc index, 1234
  557. AESENC \TMP2, %xmm\index
  558. .endr
  559. add $16,%r10
  560. sub $1,%eax
  561. jnz aes_loop_pre_enc\num_initial_blocks
  562. aes_loop_pre_enc_done\num_initial_blocks:
  563. MOVADQ (%r10), \TMP2
  564. AESENCLAST \TMP2, \XMM1
  565. AESENCLAST \TMP2, \XMM2
  566. AESENCLAST \TMP2, \XMM3
  567. AESENCLAST \TMP2, \XMM4
  568. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  569. pxor \TMP1, \XMM1
  570. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  571. pxor \TMP1, \XMM2
  572. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  573. pxor \TMP1, \XMM3
  574. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  575. pxor \TMP1, \XMM4
  576. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  577. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  578. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  579. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  580. add $64, %r11
  581. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  582. pxor \XMMDst, \XMM1
  583. # combine GHASHed value with the corresponding ciphertext
  584. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  585. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  586. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  587. _initial_blocks_done\num_initial_blocks\operation:
  588. .endm
  589. /*
  590. * encrypt 4 blocks at a time
  591. * ghash the 4 previously encrypted ciphertext blocks
  592. * arg1, %arg2, %arg3 are used as pointers only, not modified
  593. * %r11 is the data offset value
  594. */
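/*
* This macro interleaves the two halves of GCM to hide latency: while the
* four counter blocks of the current iteration run through the AES rounds in
* XMM1-XMM4, the four blocks produced by the previous iteration (saved into
* XMM5-XMM8 at the top) are folded into the GHASH state using the precomputed
* powers HashKey..HashKey_4, one Karatsuba multiply per block.
*/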
  595. .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
  596. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  597. movdqa \XMM1, \XMM5
  598. movdqa \XMM2, \XMM6
  599. movdqa \XMM3, \XMM7
  600. movdqa \XMM4, \XMM8
  601. movdqa SHUF_MASK(%rip), %xmm15
  602. # multiply TMP5 * HashKey using karatsuba
  603. movdqa \XMM5, \TMP4
  604. pshufd $78, \XMM5, \TMP6
  605. pxor \XMM5, \TMP6
  606. paddd ONE(%rip), \XMM0 # INCR CNT
  607. movdqa HashKey_4(%rsp), \TMP5
  608. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  609. movdqa \XMM0, \XMM1
  610. paddd ONE(%rip), \XMM0 # INCR CNT
  611. movdqa \XMM0, \XMM2
  612. paddd ONE(%rip), \XMM0 # INCR CNT
  613. movdqa \XMM0, \XMM3
  614. paddd ONE(%rip), \XMM0 # INCR CNT
  615. movdqa \XMM0, \XMM4
  616. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  617. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  618. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  619. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  620. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  621. pxor (%arg1), \XMM1
  622. pxor (%arg1), \XMM2
  623. pxor (%arg1), \XMM3
  624. pxor (%arg1), \XMM4
  625. movdqa HashKey_4_k(%rsp), \TMP5
  626. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  627. movaps 0x10(%arg1), \TMP1
  628. AESENC \TMP1, \XMM1 # Round 1
  629. AESENC \TMP1, \XMM2
  630. AESENC \TMP1, \XMM3
  631. AESENC \TMP1, \XMM4
  632. movaps 0x20(%arg1), \TMP1
  633. AESENC \TMP1, \XMM1 # Round 2
  634. AESENC \TMP1, \XMM2
  635. AESENC \TMP1, \XMM3
  636. AESENC \TMP1, \XMM4
  637. movdqa \XMM6, \TMP1
  638. pshufd $78, \XMM6, \TMP2
  639. pxor \XMM6, \TMP2
  640. movdqa HashKey_3(%rsp), \TMP5
  641. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  642. movaps 0x30(%arg1), \TMP3
  643. AESENC \TMP3, \XMM1 # Round 3
  644. AESENC \TMP3, \XMM2
  645. AESENC \TMP3, \XMM3
  646. AESENC \TMP3, \XMM4
  647. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  648. movaps 0x40(%arg1), \TMP3
  649. AESENC \TMP3, \XMM1 # Round 4
  650. AESENC \TMP3, \XMM2
  651. AESENC \TMP3, \XMM3
  652. AESENC \TMP3, \XMM4
  653. movdqa HashKey_3_k(%rsp), \TMP5
  654. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  655. movaps 0x50(%arg1), \TMP3
  656. AESENC \TMP3, \XMM1 # Round 5
  657. AESENC \TMP3, \XMM2
  658. AESENC \TMP3, \XMM3
  659. AESENC \TMP3, \XMM4
  660. pxor \TMP1, \TMP4
  661. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  662. pxor \XMM6, \XMM5
  663. pxor \TMP2, \TMP6
  664. movdqa \XMM7, \TMP1
  665. pshufd $78, \XMM7, \TMP2
  666. pxor \XMM7, \TMP2
  667. movdqa HashKey_2(%rsp ), \TMP5
  668. # Multiply TMP5 * HashKey using karatsuba
  669. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  670. movaps 0x60(%arg1), \TMP3
  671. AESENC \TMP3, \XMM1 # Round 6
  672. AESENC \TMP3, \XMM2
  673. AESENC \TMP3, \XMM3
  674. AESENC \TMP3, \XMM4
  675. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  676. movaps 0x70(%arg1), \TMP3
  677. AESENC \TMP3, \XMM1 # Round 7
  678. AESENC \TMP3, \XMM2
  679. AESENC \TMP3, \XMM3
  680. AESENC \TMP3, \XMM4
  681. movdqa HashKey_2_k(%rsp), \TMP5
  682. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  683. movaps 0x80(%arg1), \TMP3
  684. AESENC \TMP3, \XMM1 # Round 8
  685. AESENC \TMP3, \XMM2
  686. AESENC \TMP3, \XMM3
  687. AESENC \TMP3, \XMM4
  688. pxor \TMP1, \TMP4
  689. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  690. pxor \XMM7, \XMM5
  691. pxor \TMP2, \TMP6
  692. # Multiply XMM8 * HashKey
  693. # XMM8 and TMP5 hold the values for the two operands
  694. movdqa \XMM8, \TMP1
  695. pshufd $78, \XMM8, \TMP2
  696. pxor \XMM8, \TMP2
  697. movdqa HashKey(%rsp), \TMP5
  698. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  699. movaps 0x90(%arg1), \TMP3
  700. AESENC \TMP3, \XMM1 # Round 9
  701. AESENC \TMP3, \XMM2
  702. AESENC \TMP3, \XMM3
  703. AESENC \TMP3, \XMM4
  704. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  705. lea 0xa0(%arg1),%r10
  706. mov keysize,%eax
  707. shr $2,%eax # 128->4, 192->6, 256->8
  708. sub $4,%eax # 128->0, 192->2, 256->4
  709. jz aes_loop_par_enc_done
  710. aes_loop_par_enc:
  711. MOVADQ (%r10),\TMP3
  712. .irpc index, 1234
  713. AESENC \TMP3, %xmm\index
  714. .endr
  715. add $16,%r10
  716. sub $1,%eax
  717. jnz aes_loop_par_enc
  718. aes_loop_par_enc_done:
  719. MOVADQ (%r10), \TMP3
  720. AESENCLAST \TMP3, \XMM1 # Round 10
  721. AESENCLAST \TMP3, \XMM2
  722. AESENCLAST \TMP3, \XMM3
  723. AESENCLAST \TMP3, \XMM4
  724. movdqa HashKey_k(%rsp), \TMP5
  725. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  726. movdqu (%arg3,%r11,1), \TMP3
  727. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  728. movdqu 16(%arg3,%r11,1), \TMP3
  729. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  730. movdqu 32(%arg3,%r11,1), \TMP3
  731. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  732. movdqu 48(%arg3,%r11,1), \TMP3
  733. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  734. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  735. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  736. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  737. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  738. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  739. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  740. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  741. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  742. pxor \TMP4, \TMP1
  743. pxor \XMM8, \XMM5
  744. pxor \TMP6, \TMP2
  745. pxor \TMP1, \TMP2
  746. pxor \XMM5, \TMP2
  747. movdqa \TMP2, \TMP3
  748. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  749. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  750. pxor \TMP3, \XMM5
  751. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  752. # first phase of reduction
  753. movdqa \XMM5, \TMP2
  754. movdqa \XMM5, \TMP3
  755. movdqa \XMM5, \TMP4
  756. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
  760. pxor \TMP3, \TMP2 # xor the shifted versions
  761. pxor \TMP4, \TMP2
  762. movdqa \TMP2, \TMP5
  763. psrldq $4, \TMP5 # right shift T5 1 DW
  764. pslldq $12, \TMP2 # left shift T2 3 DWs
  765. pxor \TMP2, \XMM5
  766. # second phase of reduction
  767. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  768. movdqa \XMM5,\TMP3
  769. movdqa \XMM5,\TMP4
psrld $1, \TMP2 # packed right shift >>1
psrld $2, \TMP3 # packed right shift >>2
psrld $7, \TMP4 # packed right shift >>7
  773. pxor \TMP3,\TMP2 # xor the shifted versions
  774. pxor \TMP4,\TMP2
  775. pxor \TMP5, \TMP2
  776. pxor \TMP2, \XMM5
pxor \TMP1, \XMM5 # result is in XMM5
  778. pxor \XMM5, \XMM1
  779. .endm
  780. /*
  781. * decrypt 4 blocks at a time
  782. * ghash the 4 previously decrypted ciphertext blocks
  783. * arg1, %arg2, %arg3 are used as pointers only, not modified
  784. * %r11 is the data offset value
  785. */
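/*
* Decrypt variant of the macro above: the AES/GHASH interleaving is identical,
* but after XORing the keystream the original ciphertext block (saved in TMP3)
* is copied back into XMM1-XMM4, because for decryption GHASH must be taken
* over the ciphertext rather than over the newly produced plaintext.
*/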
  786. .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
  787. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  788. movdqa \XMM1, \XMM5
  789. movdqa \XMM2, \XMM6
  790. movdqa \XMM3, \XMM7
  791. movdqa \XMM4, \XMM8
  792. movdqa SHUF_MASK(%rip), %xmm15
  793. # multiply TMP5 * HashKey using karatsuba
  794. movdqa \XMM5, \TMP4
  795. pshufd $78, \XMM5, \TMP6
  796. pxor \XMM5, \TMP6
  797. paddd ONE(%rip), \XMM0 # INCR CNT
  798. movdqa HashKey_4(%rsp), \TMP5
  799. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  800. movdqa \XMM0, \XMM1
  801. paddd ONE(%rip), \XMM0 # INCR CNT
  802. movdqa \XMM0, \XMM2
  803. paddd ONE(%rip), \XMM0 # INCR CNT
  804. movdqa \XMM0, \XMM3
  805. paddd ONE(%rip), \XMM0 # INCR CNT
  806. movdqa \XMM0, \XMM4
  807. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  808. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  809. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  810. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  811. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  812. pxor (%arg1), \XMM1
  813. pxor (%arg1), \XMM2
  814. pxor (%arg1), \XMM3
  815. pxor (%arg1), \XMM4
  816. movdqa HashKey_4_k(%rsp), \TMP5
  817. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  818. movaps 0x10(%arg1), \TMP1
  819. AESENC \TMP1, \XMM1 # Round 1
  820. AESENC \TMP1, \XMM2
  821. AESENC \TMP1, \XMM3
  822. AESENC \TMP1, \XMM4
  823. movaps 0x20(%arg1), \TMP1
  824. AESENC \TMP1, \XMM1 # Round 2
  825. AESENC \TMP1, \XMM2
  826. AESENC \TMP1, \XMM3
  827. AESENC \TMP1, \XMM4
  828. movdqa \XMM6, \TMP1
  829. pshufd $78, \XMM6, \TMP2
  830. pxor \XMM6, \TMP2
  831. movdqa HashKey_3(%rsp), \TMP5
  832. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  833. movaps 0x30(%arg1), \TMP3
  834. AESENC \TMP3, \XMM1 # Round 3
  835. AESENC \TMP3, \XMM2
  836. AESENC \TMP3, \XMM3
  837. AESENC \TMP3, \XMM4
  838. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  839. movaps 0x40(%arg1), \TMP3
  840. AESENC \TMP3, \XMM1 # Round 4
  841. AESENC \TMP3, \XMM2
  842. AESENC \TMP3, \XMM3
  843. AESENC \TMP3, \XMM4
  844. movdqa HashKey_3_k(%rsp), \TMP5
  845. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  846. movaps 0x50(%arg1), \TMP3
  847. AESENC \TMP3, \XMM1 # Round 5
  848. AESENC \TMP3, \XMM2
  849. AESENC \TMP3, \XMM3
  850. AESENC \TMP3, \XMM4
  851. pxor \TMP1, \TMP4
  852. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  853. pxor \XMM6, \XMM5
  854. pxor \TMP2, \TMP6
  855. movdqa \XMM7, \TMP1
  856. pshufd $78, \XMM7, \TMP2
  857. pxor \XMM7, \TMP2
  858. movdqa HashKey_2(%rsp ), \TMP5
  859. # Multiply TMP5 * HashKey using karatsuba
  860. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  861. movaps 0x60(%arg1), \TMP3
  862. AESENC \TMP3, \XMM1 # Round 6
  863. AESENC \TMP3, \XMM2
  864. AESENC \TMP3, \XMM3
  865. AESENC \TMP3, \XMM4
  866. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  867. movaps 0x70(%arg1), \TMP3
  868. AESENC \TMP3, \XMM1 # Round 7
  869. AESENC \TMP3, \XMM2
  870. AESENC \TMP3, \XMM3
  871. AESENC \TMP3, \XMM4
  872. movdqa HashKey_2_k(%rsp), \TMP5
  873. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  874. movaps 0x80(%arg1), \TMP3
  875. AESENC \TMP3, \XMM1 # Round 8
  876. AESENC \TMP3, \XMM2
  877. AESENC \TMP3, \XMM3
  878. AESENC \TMP3, \XMM4
  879. pxor \TMP1, \TMP4
  880. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  881. pxor \XMM7, \XMM5
  882. pxor \TMP2, \TMP6
  883. # Multiply XMM8 * HashKey
  884. # XMM8 and TMP5 hold the values for the two operands
  885. movdqa \XMM8, \TMP1
  886. pshufd $78, \XMM8, \TMP2
  887. pxor \XMM8, \TMP2
  888. movdqa HashKey(%rsp), \TMP5
  889. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  890. movaps 0x90(%arg1), \TMP3
  891. AESENC \TMP3, \XMM1 # Round 9
  892. AESENC \TMP3, \XMM2
  893. AESENC \TMP3, \XMM3
  894. AESENC \TMP3, \XMM4
  895. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  896. lea 0xa0(%arg1),%r10
  897. mov keysize,%eax
  898. shr $2,%eax # 128->4, 192->6, 256->8
  899. sub $4,%eax # 128->0, 192->2, 256->4
  900. jz aes_loop_par_dec_done
  901. aes_loop_par_dec:
  902. MOVADQ (%r10),\TMP3
  903. .irpc index, 1234
  904. AESENC \TMP3, %xmm\index
  905. .endr
  906. add $16,%r10
  907. sub $1,%eax
  908. jnz aes_loop_par_dec
  909. aes_loop_par_dec_done:
  910. MOVADQ (%r10), \TMP3
  911. AESENCLAST \TMP3, \XMM1 # last round
  912. AESENCLAST \TMP3, \XMM2
  913. AESENCLAST \TMP3, \XMM3
  914. AESENCLAST \TMP3, \XMM4
  915. movdqa HashKey_k(%rsp), \TMP5
  916. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  917. movdqu (%arg3,%r11,1), \TMP3
  918. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  919. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  920. movdqa \TMP3, \XMM1
  921. movdqu 16(%arg3,%r11,1), \TMP3
  922. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  923. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  924. movdqa \TMP3, \XMM2
  925. movdqu 32(%arg3,%r11,1), \TMP3
  926. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  927. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  928. movdqa \TMP3, \XMM3
  929. movdqu 48(%arg3,%r11,1), \TMP3
  930. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  931. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  932. movdqa \TMP3, \XMM4
  933. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  934. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  935. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  936. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  937. pxor \TMP4, \TMP1
  938. pxor \XMM8, \XMM5
  939. pxor \TMP6, \TMP2
  940. pxor \TMP1, \TMP2
  941. pxor \XMM5, \TMP2
  942. movdqa \TMP2, \TMP3
  943. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  944. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  945. pxor \TMP3, \XMM5
  946. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  947. # first phase of reduction
  948. movdqa \XMM5, \TMP2
  949. movdqa \XMM5, \TMP3
  950. movdqa \XMM5, \TMP4
  951. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
  955. pxor \TMP3, \TMP2 # xor the shifted versions
  956. pxor \TMP4, \TMP2
  957. movdqa \TMP2, \TMP5
  958. psrldq $4, \TMP5 # right shift T5 1 DW
  959. pslldq $12, \TMP2 # left shift T2 3 DWs
  960. pxor \TMP2, \XMM5
  961. # second phase of reduction
  962. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  963. movdqa \XMM5,\TMP3
  964. movdqa \XMM5,\TMP4
psrld $1, \TMP2 # packed right shift >>1
psrld $2, \TMP3 # packed right shift >>2
psrld $7, \TMP4 # packed right shift >>7
  968. pxor \TMP3,\TMP2 # xor the shifted versions
  969. pxor \TMP4,\TMP2
  970. pxor \TMP5, \TMP2
  971. pxor \TMP2, \XMM5
pxor \TMP1, \XMM5 # result is in XMM5
  973. pxor \XMM5, \XMM1
  974. .endm
  975. /* GHASH the last 4 ciphertext blocks. */
  976. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  977. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  978. # Multiply TMP6 * HashKey (using Karatsuba)
  979. movdqa \XMM1, \TMP6
  980. pshufd $78, \XMM1, \TMP2
  981. pxor \XMM1, \TMP2
  982. movdqa HashKey_4(%rsp), \TMP5
  983. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  984. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  985. movdqa HashKey_4_k(%rsp), \TMP4
  986. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  987. movdqa \XMM1, \XMMDst
  988. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  989. # Multiply TMP1 * HashKey (using Karatsuba)
  990. movdqa \XMM2, \TMP1
  991. pshufd $78, \XMM2, \TMP2
  992. pxor \XMM2, \TMP2
  993. movdqa HashKey_3(%rsp), \TMP5
  994. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  995. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  996. movdqa HashKey_3_k(%rsp), \TMP4
  997. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  998. pxor \TMP1, \TMP6
  999. pxor \XMM2, \XMMDst
  1000. pxor \TMP2, \XMM1
  1001. # results accumulated in TMP6, XMMDst, XMM1
  1002. # Multiply TMP1 * HashKey (using Karatsuba)
  1003. movdqa \XMM3, \TMP1
  1004. pshufd $78, \XMM3, \TMP2
  1005. pxor \XMM3, \TMP2
  1006. movdqa HashKey_2(%rsp), \TMP5
  1007. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1008. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  1009. movdqa HashKey_2_k(%rsp), \TMP4
  1010. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1011. pxor \TMP1, \TMP6
  1012. pxor \XMM3, \XMMDst
  1013. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  1014. # Multiply TMP1 * HashKey (using Karatsuba)
  1015. movdqa \XMM4, \TMP1
  1016. pshufd $78, \XMM4, \TMP2
  1017. pxor \XMM4, \TMP2
  1018. movdqa HashKey(%rsp), \TMP5
  1019. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1020. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  1021. movdqa HashKey_k(%rsp), \TMP4
  1022. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1023. pxor \TMP1, \TMP6
  1024. pxor \XMM4, \XMMDst
  1025. pxor \XMM1, \TMP2
  1026. pxor \TMP6, \TMP2
  1027. pxor \XMMDst, \TMP2
  1028. # middle section of the temp results combined as in karatsuba algorithm
  1029. movdqa \TMP2, \TMP4
  1030. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  1031. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1032. pxor \TMP4, \XMMDst
  1033. pxor \TMP2, \TMP6
  1034. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  1035. # first phase of the reduction
  1036. movdqa \XMMDst, \TMP2
  1037. movdqa \XMMDst, \TMP3
  1038. movdqa \XMMDst, \TMP4
  1039. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
pslld $31, \TMP2 # packed left shifting << 31
pslld $30, \TMP3 # packed left shifting << 30
pslld $25, \TMP4 # packed left shifting << 25
  1043. pxor \TMP3, \TMP2 # xor the shifted versions
  1044. pxor \TMP4, \TMP2
  1045. movdqa \TMP2, \TMP7
  1046. psrldq $4, \TMP7 # right shift TMP7 1 DW
  1047. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  1048. pxor \TMP2, \XMMDst
  1049. # second phase of the reduction
  1050. movdqa \XMMDst, \TMP2
  1051. # make 3 copies of XMMDst for doing 3 shift operations
  1052. movdqa \XMMDst, \TMP3
  1053. movdqa \XMMDst, \TMP4
psrld $1, \TMP2 # packed right shift >> 1
psrld $2, \TMP3 # packed right shift >> 2
psrld $7, \TMP4 # packed right shift >> 7
  1057. pxor \TMP3, \TMP2 # xor the shifted versions
  1058. pxor \TMP4, \TMP2
  1059. pxor \TMP7, \TMP2
  1060. pxor \TMP2, \XMMDst
  1061. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  1062. .endm
  1063. /* Encryption of a single block
  1064. * uses eax & r10
  1065. */
  1066. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  1067. pxor (%arg1), \XMM0
  1068. mov keysize,%eax
  1069. shr $2,%eax # 128->4, 192->6, 256->8
  1070. add $5,%eax # 128->9, 192->11, 256->13
  1071. lea 16(%arg1), %r10 # get first expanded key address
  1072. _esb_loop_\@:
  1073. MOVADQ (%r10),\TMP1
  1074. AESENC \TMP1,\XMM0
  1075. add $16,%r10
  1076. sub $1,%eax
  1077. jnz _esb_loop_\@
  1078. MOVADQ (%r10),\TMP1
  1079. AESENCLAST \TMP1,\XMM0
  1080. .endm
  1081. /*****************************************************************************
  1082. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1083. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  1084. * const u8 *in, // Ciphertext input
  1085. * u64 plaintext_len, // Length of data in bytes for decryption.
  1086. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1087. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1088. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1089. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1090. * const u8 *aad, // Additional Authentication Data (AAD)
  1091. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1092. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  1093. * // given authentication tag and only return the plaintext if they match.
  1094. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  1095. * // (most likely), 12 or 8.
  1096. *
  1097. * Assumptions:
  1098. *
  1099. * keys:
* keys are pre-expanded and aligned to 16 bytes. We are using the first
  1101. * set of 11 keys in the data structure void *aes_ctx
  1102. *
  1103. * iv:
  1104. * 0 1 2 3
  1105. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1106. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1107. * | Salt (From the SA) |
  1108. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1109. * | Initialization Vector |
  1110. * | (This is the sequence number from IPSec header) |
  1111. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1112. * | 0x1 |
  1113. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1114. *
  1115. *
  1116. *
  1117. * AAD:
  1118. * AAD padded to 128 bits with 0
  1119. * for example, assume AAD is a u32 vector
  1120. *
  1121. * if AAD is 8 bytes:
  1122. * AAD[3] = {A0, A1};
  1123. * padded AAD in xmm register = {A1 A0 0 0}
  1124. *
  1125. * 0 1 2 3
  1126. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1127. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1128. * | SPI (A1) |
  1129. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1130. * | 32-bit Sequence Number (A0) |
  1131. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1132. * | 0x0 |
  1133. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1134. *
  1135. * AAD Format with 32-bit Sequence Number
  1136. *
  1137. * if AAD is 12 bytes:
  1138. * AAD[3] = {A0, A1, A2};
  1139. * padded AAD in xmm register = {A2 A1 A0 0}
  1140. *
  1141. * 0 1 2 3
  1142. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1143. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1146. * | SPI (A2) |
  1147. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1148. * | 64-bit Extended Sequence Number {A1,A0} |
  1149. * | |
  1150. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1151. * | 0x0 |
  1152. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1153. *
  1154. * AAD Format with 64-bit Extended Sequence Number
  1155. *
  1156. * aadLen:
  1157. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code also supports an aadLen of 16 bytes, but it will fail for any other size.
  1159. *
  1160. * TLen:
  1161. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1162. * For other sizes, the code will fail.
  1163. *
  1164. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1165. *
  1166. *****************************************************************************/
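/*
* Seen from C, the interface documented above corresponds roughly to the
* following prototype (illustrative only; the authoritative declaration lives
* in the AES-NI glue code):
*
*   void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in,
*                      u64 plaintext_len, u8 *iv, u8 *hash_subkey,
*                      const u8 *aad, u64 aad_len,
*                      u8 *auth_tag, u64 auth_tag_len);
*/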
  1167. ENTRY(aesni_gcm_dec)
  1168. push %r12
  1169. push %r13
  1170. push %r14
  1171. mov %rsp, %r14
  1172. /*
  1173. * states of %xmm registers %xmm6:%xmm15 not saved
  1174. * all %xmm registers are clobbered
  1175. */
  1176. sub $VARIABLE_OFFSET, %rsp
  1177. and $~63, %rsp # align rsp to 64 bytes
  1178. mov %arg6, %r12
  1179. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  1180. movdqa SHUF_MASK(%rip), %xmm2
  1181. PSHUFB_XMM %xmm2, %xmm13
  1182. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  1183. movdqa %xmm13, %xmm2
  1184. psllq $1, %xmm13
  1185. psrlq $63, %xmm2
  1186. movdqa %xmm2, %xmm1
  1187. pslldq $8, %xmm2
  1188. psrldq $8, %xmm1
  1189. por %xmm2, %xmm13
  1190. # Reduction
  1191. pshufd $0x24, %xmm1, %xmm2
  1192. pcmpeqd TWOONE(%rip), %xmm2
  1193. pand POLY(%rip), %xmm2
  1194. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
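# The psllq/psrlq pair above shifts the byte-reflected HashKey left by one bit
# (propagating the carry across the 64-bit halves), and the pshufd/pcmpeqd/
# pand sequence XORs POLY back in only when a bit was carried out of position
# 127, i.e. it performs the modular reduction for HashKey<<1.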
  1195. # Decrypt first few blocks
  1196. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  1197. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1198. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  1199. mov %r13, %r12
  1200. and $(3<<4), %r12
  1201. jz _initial_num_blocks_is_0_decrypt
  1202. cmp $(2<<4), %r12
  1203. jb _initial_num_blocks_is_1_decrypt
  1204. je _initial_num_blocks_is_2_decrypt
  1205. _initial_num_blocks_is_3_decrypt:
  1206. INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1207. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  1208. sub $48, %r13
  1209. jmp _initial_blocks_decrypted
  1210. _initial_num_blocks_is_2_decrypt:
  1211. INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1212. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  1213. sub $32, %r13
  1214. jmp _initial_blocks_decrypted
  1215. _initial_num_blocks_is_1_decrypt:
  1216. INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1217. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  1218. sub $16, %r13
  1219. jmp _initial_blocks_decrypted
  1220. _initial_num_blocks_is_0_decrypt:
  1221. INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1222. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  1223. _initial_blocks_decrypted:
  1224. cmp $0, %r13
  1225. je _zero_cipher_left_decrypt
  1226. sub $64, %r13
  1227. je _four_cipher_left_decrypt
  1228. _decrypt_by_4:
  1229. GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1230. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  1231. add $64, %r11
  1232. sub $64, %r13
  1233. jne _decrypt_by_4
  1234. _four_cipher_left_decrypt:
  1235. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1236. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1237. _zero_cipher_left_decrypt:
  1238. mov %arg4, %r13
  1239. and $15, %r13 # %r13 = arg4 (mod 16)
  1240. je _multiple_of_16_bytes_decrypt
  1241. # Handle the last <16 byte block separately
  1242. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  1243. movdqa SHUF_MASK(%rip), %xmm10
  1244. PSHUFB_XMM %xmm10, %xmm0
  1245. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  1246. sub $16, %r11
  1247. add %r13, %r11
  1248. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1249. lea SHIFT_MASK+16(%rip), %r12
  1250. sub %r13, %r12
  1251. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  1252. # (%r13 is the number of bytes in plaintext mod 16)
  1253. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  1254. PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
  1255. movdqa %xmm1, %xmm2
  1256. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  1257. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1258. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  1259. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  1260. pand %xmm1, %xmm2
  1261. movdqa SHUF_MASK(%rip), %xmm10
  1262. PSHUFB_XMM %xmm10, %xmm2
  1263. pxor %xmm2, %xmm8
  1264. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1265. # GHASH computation for the last <16 byte block
  1266. sub %r13, %r11
  1267. add $16, %r11
  1268. # output %r13 bytes
  1269. MOVQ_R64_XMM %xmm0, %rax
  1270. cmp $8, %r13
  1271. jle _less_than_8_bytes_left_decrypt
  1272. mov %rax, (%arg2 , %r11, 1)
  1273. add $8, %r11
  1274. psrldq $8, %xmm0
  1275. MOVQ_R64_XMM %xmm0, %rax
  1276. sub $8, %r13
  1277. _less_than_8_bytes_left_decrypt:
  1278. mov %al, (%arg2, %r11, 1)
  1279. add $1, %r11
  1280. shr $8, %rax
  1281. sub $1, %r13
  1282. jne _less_than_8_bytes_left_decrypt
  1283. _multiple_of_16_bytes_decrypt:
  1284. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1285. shl $3, %r12 # convert into number of bits
  1286. movd %r12d, %xmm15 # len(A) in %xmm15
  1287. shl $3, %arg4 # len(C) in bits (*8)
  1288. MOVQ_R64_XMM %arg4, %xmm1
  1289. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1290. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1291. pxor %xmm15, %xmm8
  1292. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1293. # final GHASH computation
  1294. movdqa SHUF_MASK(%rip), %xmm10
  1295. PSHUFB_XMM %xmm10, %xmm8
  1296. mov %arg5, %rax # %rax = *Y0
  1297. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1298. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  1299. pxor %xmm8, %xmm0
  1300. _return_T_decrypt:
  1301. mov arg9, %r10 # %r10 = authTag
  1302. mov arg10, %r11 # %r11 = auth_tag_len
  1303. cmp $16, %r11
  1304. je _T_16_decrypt
  1305. cmp $12, %r11
  1306. je _T_12_decrypt
  1307. _T_8_decrypt:
  1308. MOVQ_R64_XMM %xmm0, %rax
  1309. mov %rax, (%r10)
  1310. jmp _return_T_done_decrypt
  1311. _T_12_decrypt:
  1312. MOVQ_R64_XMM %xmm0, %rax
  1313. mov %rax, (%r10)
  1314. psrldq $8, %xmm0
  1315. movd %xmm0, %eax
  1316. mov %eax, 8(%r10)
  1317. jmp _return_T_done_decrypt
  1318. _T_16_decrypt:
  1319. movdqu %xmm0, (%r10)
  1320. _return_T_done_decrypt:
  1321. mov %r14, %rsp
  1322. pop %r14
  1323. pop %r13
  1324. pop %r12
  1325. ret
  1326. ENDPROC(aesni_gcm_dec)
  1327. /*****************************************************************************
  1328. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1329. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1330. * const u8 *in, // Plaintext input
  1331. * u64 plaintext_len, // Length of data in bytes for encryption.
  1332. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1333. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1334. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1335. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1336. * const u8 *aad, // Additional Authentication Data (AAD)
  1337. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1338. * u8 *auth_tag, // Authenticated Tag output.
  1339. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1340. * // 12 or 8.
  1341. *
  1342. * Assumptions:
  1343. *
  1344. * keys:
  1345. * keys are pre-expanded and aligned to 16 bytes. we are using the
  1346. * first set of 11 keys in the data structure void *aes_ctx
  1347. *
  1348. *
  1349. * iv:
  1350. * 0 1 2 3
  1351. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1352. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1353. * | Salt (From the SA) |
  1354. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1355. * | Initialization Vector |
  1356. * | (This is the sequence number from IPSec header) |
  1357. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1358. * | 0x1 |
  1359. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1360. *
  1361. *
  1362. *
  1363. * AAD:
  1364. * AAD padded to 128 bits with 0
  1365. * for example, assume AAD is a u32 vector
  1366. *
  1367. * if AAD is 8 bytes:
  1368. * AAD[3] = {A0, A1};
  1369. * padded AAD in xmm register = {A1 A0 0 0}
  1370. *
  1371. * 0 1 2 3
  1372. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1373. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1374. * | SPI (A1) |
  1375. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1376. * | 32-bit Sequence Number (A0) |
  1377. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1378. * | 0x0 |
  1379. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1380. *
  1381. * AAD Format with 32-bit Sequence Number
  1382. *
  1383. * if AAD is 12 bytes:
  1384. * AAD[3] = {A0, A1, A2};
  1385. * padded AAD in xmm register = {A2 A1 A0 0}
  1386. *
  1387. * 0 1 2 3
  1388. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1389. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1390. * | SPI (A2) |
  1391. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1392. * | 64-bit Extended Sequence Number {A1,A0} |
  1393. * | |
  1394. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1395. * | 0x0 |
  1396. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1397. *
  1398. * AAD Format with 64-bit Extended Sequence Number
  1399. *
  1400. * aadLen:
  1401. * per the spec, aadLen can only be 8 or 12 bytes. This code also supports
  1402. * 16 bytes; any other size will fail.
  1403. *
  1404. * TLen:
  1405. * per the spec, TLen can only be 8, 12 or 16 bytes.
  1406. * Any other size will fail.
  1407. *
  1408. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1409. ***************************************************************************/
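/*
 * Caller-side sketch for the prototype documented above.  The typedefs and
 * buffer names are placeholders; only the parameter order and meanings come
 * from the comment block.
 *
 *	#include <stdint.h>
 *
 *	typedef uint8_t  u8;
 *	typedef uint64_t u64;
 *
 *	extern void aesni_gcm_enc(void *aes_ctx, u8 *out, const u8 *in,
 *				  u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *				  const u8 *aad, u64 aad_len,
 *				  u8 *auth_tag, u64 auth_tag_len);
 *
 *	// e.g. an RFC4106-style call: 12-byte AAD (SPI + 64-bit ESN), 16-byte tag
 *	// aesni_gcm_enc(ctx, dst, src, len, j0, hash_subkey, aad, 12, tag, 16);
 */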
  1410. ENTRY(aesni_gcm_enc)
  1411. push %r12
  1412. push %r13
  1413. push %r14
  1414. mov %rsp, %r14
  1415. #
  1416. # states of %xmm registers %xmm6:%xmm15 not saved
  1417. # all %xmm registers are clobbered
  1418. #
  1419. sub $VARIABLE_OFFSET, %rsp
  1420. and $~63, %rsp
  1421. mov %arg6, %r12
  1422. movdqu (%r12), %xmm13
  1423. movdqa SHUF_MASK(%rip), %xmm2
  1424. PSHUFB_XMM %xmm2, %xmm13
  1425. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1426. movdqa %xmm13, %xmm2
  1427. psllq $1, %xmm13
  1428. psrlq $63, %xmm2
  1429. movdqa %xmm2, %xmm1
  1430. pslldq $8, %xmm2
  1431. psrldq $8, %xmm1
  1432. por %xmm2, %xmm13
  1433. # reduce HashKey<<1
  1434. pshufd $0x24, %xmm1, %xmm2
  1435. pcmpeqd TWOONE(%rip), %xmm2
  1436. pand POLY(%rip), %xmm2
  1437. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
  1438. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  1439. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1440. and $-16, %r13
  1441. mov %r13, %r12
  1442. # Encrypt first few blocks
  1443. and $(3<<4), %r12
  1444. jz _initial_num_blocks_is_0_encrypt
  1445. cmp $(2<<4), %r12
  1446. jb _initial_num_blocks_is_1_encrypt
  1447. je _initial_num_blocks_is_2_encrypt
  1448. _initial_num_blocks_is_3_encrypt:
  1449. INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1450. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1451. sub $48, %r13
  1452. jmp _initial_blocks_encrypted
  1453. _initial_num_blocks_is_2_encrypt:
  1454. INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1455. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1456. sub $32, %r13
  1457. jmp _initial_blocks_encrypted
  1458. _initial_num_blocks_is_1_encrypt:
  1459. INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1460. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1461. sub $16, %r13
  1462. jmp _initial_blocks_encrypted
  1463. _initial_num_blocks_is_0_encrypt:
  1464. INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1465. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1466. _initial_blocks_encrypted:
  1467. # Main loop - Encrypt remaining blocks
  1468. cmp $0, %r13
  1469. je _zero_cipher_left_encrypt
  1470. sub $64, %r13
  1471. je _four_cipher_left_encrypt
  1472. _encrypt_by_4_encrypt:
  1473. GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1474. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1475. add $64, %r11
  1476. sub $64, %r13
  1477. jne _encrypt_by_4_encrypt
  1478. _four_cipher_left_encrypt:
  1479. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1480. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1481. _zero_cipher_left_encrypt:
  1482. mov %arg4, %r13
  1483. and $15, %r13 # %r13 = arg4 (mod 16)
  1484. je _multiple_of_16_bytes_encrypt
  1485. # Handle the last <16 Byte block separately
  1486. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1487. movdqa SHUF_MASK(%rip), %xmm10
  1488. PSHUFB_XMM %xmm10, %xmm0
  1489. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1490. sub $16, %r11
  1491. add %r13, %r11
  1492. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1493. lea SHIFT_MASK+16(%rip), %r12
  1494. sub %r13, %r12
  1495. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1496. # (%r13 is the number of bytes in plaintext mod 16)
  1497. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  1498. PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
  1499. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1500. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1501. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1502. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1503. movdqa SHUF_MASK(%rip), %xmm10
  1504. PSHUFB_XMM %xmm10,%xmm0
  1505. pxor %xmm0, %xmm8
  1506. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1507. # GHASH computation for the last <16 byte block
  1508. sub %r13, %r11
  1509. add $16, %r11
  1510. movdqa SHUF_MASK(%rip), %xmm10
  1511. PSHUFB_XMM %xmm10, %xmm0
  1512. # shuffle xmm0 back to output as ciphertext
  1513. # Output %r13 bytes
  1514. MOVQ_R64_XMM %xmm0, %rax
  1515. cmp $8, %r13
  1516. jle _less_than_8_bytes_left_encrypt
  1517. mov %rax, (%arg2 , %r11, 1)
  1518. add $8, %r11
  1519. psrldq $8, %xmm0
  1520. MOVQ_R64_XMM %xmm0, %rax
  1521. sub $8, %r13
  1522. _less_than_8_bytes_left_encrypt:
  1523. mov %al, (%arg2, %r11, 1)
  1524. add $1, %r11
  1525. shr $8, %rax
  1526. sub $1, %r13
  1527. jne _less_than_8_bytes_left_encrypt
  1528. _multiple_of_16_bytes_encrypt:
  1529. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1530. shl $3, %r12
  1531. movd %r12d, %xmm15 # len(A) in %xmm15
  1532. shl $3, %arg4 # len(C) in bits (*8)
  1533. MOVQ_R64_XMM %arg4, %xmm1
  1534. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1535. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1536. pxor %xmm15, %xmm8
  1537. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1538. # final GHASH computation
  1539. movdqa SHUF_MASK(%rip), %xmm10
  1540. PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
  1541. mov %arg5, %rax # %rax = *Y0
  1542. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1543. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1544. pxor %xmm8, %xmm0
  1545. _return_T_encrypt:
  1546. mov arg9, %r10 # %r10 = authTag
  1547. mov arg10, %r11 # %r11 = auth_tag_len
  1548. cmp $16, %r11
  1549. je _T_16_encrypt
  1550. cmp $12, %r11
  1551. je _T_12_encrypt
  1552. _T_8_encrypt:
  1553. MOVQ_R64_XMM %xmm0, %rax
  1554. mov %rax, (%r10)
  1555. jmp _return_T_done_encrypt
  1556. _T_12_encrypt:
  1557. MOVQ_R64_XMM %xmm0, %rax
  1558. mov %rax, (%r10)
  1559. psrldq $8, %xmm0
  1560. movd %xmm0, %eax
  1561. mov %eax, 8(%r10)
  1562. jmp _return_T_done_encrypt
  1563. _T_16_encrypt:
  1564. movdqu %xmm0, (%r10)
  1565. _return_T_done_encrypt:
  1566. mov %r14, %rsp
  1567. pop %r14
  1568. pop %r13
  1569. pop %r12
  1570. ret
  1571. ENDPROC(aesni_gcm_enc)
  1572. #endif
  1573. .align 4
  1574. _key_expansion_128:
  1575. _key_expansion_256a:
  1576. pshufd $0b11111111, %xmm1, %xmm1
  1577. shufps $0b00010000, %xmm0, %xmm4
  1578. pxor %xmm4, %xmm0
  1579. shufps $0b10001100, %xmm0, %xmm4
  1580. pxor %xmm4, %xmm0
  1581. pxor %xmm1, %xmm0
  1582. movaps %xmm0, (TKEYP)
  1583. add $0x10, TKEYP
  1584. ret
  1585. ENDPROC(_key_expansion_128)
  1586. ENDPROC(_key_expansion_256a)
  1587. .align 4
  1588. _key_expansion_192a:
  1589. pshufd $0b01010101, %xmm1, %xmm1
  1590. shufps $0b00010000, %xmm0, %xmm4
  1591. pxor %xmm4, %xmm0
  1592. shufps $0b10001100, %xmm0, %xmm4
  1593. pxor %xmm4, %xmm0
  1594. pxor %xmm1, %xmm0
  1595. movaps %xmm2, %xmm5
  1596. movaps %xmm2, %xmm6
  1597. pslldq $4, %xmm5
  1598. pshufd $0b11111111, %xmm0, %xmm3
  1599. pxor %xmm3, %xmm2
  1600. pxor %xmm5, %xmm2
  1601. movaps %xmm0, %xmm1
  1602. shufps $0b01000100, %xmm0, %xmm6
  1603. movaps %xmm6, (TKEYP)
  1604. shufps $0b01001110, %xmm2, %xmm1
  1605. movaps %xmm1, 0x10(TKEYP)
  1606. add $0x20, TKEYP
  1607. ret
  1608. ENDPROC(_key_expansion_192a)
  1609. .align 4
  1610. _key_expansion_192b:
  1611. pshufd $0b01010101, %xmm1, %xmm1
  1612. shufps $0b00010000, %xmm0, %xmm4
  1613. pxor %xmm4, %xmm0
  1614. shufps $0b10001100, %xmm0, %xmm4
  1615. pxor %xmm4, %xmm0
  1616. pxor %xmm1, %xmm0
  1617. movaps %xmm2, %xmm5
  1618. pslldq $4, %xmm5
  1619. pshufd $0b11111111, %xmm0, %xmm3
  1620. pxor %xmm3, %xmm2
  1621. pxor %xmm5, %xmm2
  1622. movaps %xmm0, (TKEYP)
  1623. add $0x10, TKEYP
  1624. ret
  1625. ENDPROC(_key_expansion_192b)
  1626. .align 4
  1627. _key_expansion_256b:
  1628. pshufd $0b10101010, %xmm1, %xmm1
  1629. shufps $0b00010000, %xmm2, %xmm4
  1630. pxor %xmm4, %xmm2
  1631. shufps $0b10001100, %xmm2, %xmm4
  1632. pxor %xmm4, %xmm2
  1633. pxor %xmm1, %xmm2
  1634. movaps %xmm2, (TKEYP)
  1635. add $0x10, TKEYP
  1636. ret
  1637. ENDPROC(_key_expansion_256b)
  1638. /*
  1639. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1640. * unsigned int key_len)
  1641. */
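/*
 * The 240 and 480 offsets used in this routine (and in aesni_dec and the
 * decryption loops below) imply a key context laid out roughly as sketched
 * here: a 240-byte encryption schedule, a 240-byte decryption schedule, and
 * the key length at offset 480.  The struct is an illustrative assumption
 * based on those offsets, not a definition taken from this file.
 *
 *	#include <stdint.h>
 *
 *	struct aes_ctx_layout {
 *		uint32_t key_enc[60];   // encryption round keys, offset 0
 *		uint32_t key_dec[60];   // decryption round keys, offset 240
 *		uint32_t key_length;    // 16, 24 or 32, stored at offset 480
 *	};
 */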
  1642. ENTRY(aesni_set_key)
  1643. #ifndef __x86_64__
  1644. pushl KEYP
  1645. movl 8(%esp), KEYP # ctx
  1646. movl 12(%esp), UKEYP # in_key
  1647. movl 16(%esp), %edx # key_len
  1648. #endif
  1649. movups (UKEYP), %xmm0 # user key (first 16 bytes)
  1650. movaps %xmm0, (KEYP)
  1651. lea 0x10(KEYP), TKEYP # key addr
  1652. movl %edx, 480(KEYP)
  1653. pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
  1654. cmp $24, %dl
  1655. jb .Lenc_key128
  1656. je .Lenc_key192
  1657. movups 0x10(UKEYP), %xmm2 # other user key
  1658. movaps %xmm2, (TKEYP)
  1659. add $0x10, TKEYP
  1660. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1661. call _key_expansion_256a
  1662. AESKEYGENASSIST 0x1 %xmm0 %xmm1
  1663. call _key_expansion_256b
  1664. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1665. call _key_expansion_256a
  1666. AESKEYGENASSIST 0x2 %xmm0 %xmm1
  1667. call _key_expansion_256b
  1668. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1669. call _key_expansion_256a
  1670. AESKEYGENASSIST 0x4 %xmm0 %xmm1
  1671. call _key_expansion_256b
  1672. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1673. call _key_expansion_256a
  1674. AESKEYGENASSIST 0x8 %xmm0 %xmm1
  1675. call _key_expansion_256b
  1676. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1677. call _key_expansion_256a
  1678. AESKEYGENASSIST 0x10 %xmm0 %xmm1
  1679. call _key_expansion_256b
  1680. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1681. call _key_expansion_256a
  1682. AESKEYGENASSIST 0x20 %xmm0 %xmm1
  1683. call _key_expansion_256b
  1684. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1685. call _key_expansion_256a
  1686. jmp .Ldec_key
  1687. .Lenc_key192:
  1688. movq 0x10(UKEYP), %xmm2 # other user key
  1689. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1690. call _key_expansion_192a
  1691. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1692. call _key_expansion_192b
  1693. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1694. call _key_expansion_192a
  1695. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1696. call _key_expansion_192b
  1697. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1698. call _key_expansion_192a
  1699. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1700. call _key_expansion_192b
  1701. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1702. call _key_expansion_192a
  1703. AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
  1704. call _key_expansion_192b
  1705. jmp .Ldec_key
  1706. .Lenc_key128:
  1707. AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
  1708. call _key_expansion_128
  1709. AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
  1710. call _key_expansion_128
  1711. AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
  1712. call _key_expansion_128
  1713. AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
  1714. call _key_expansion_128
  1715. AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
  1716. call _key_expansion_128
  1717. AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
  1718. call _key_expansion_128
  1719. AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
  1720. call _key_expansion_128
  1721. AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
  1722. call _key_expansion_128
  1723. AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
  1724. call _key_expansion_128
  1725. AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
  1726. call _key_expansion_128
  1727. .Ldec_key:
  1728. sub $0x10, TKEYP
  1729. movaps (KEYP), %xmm0
  1730. movaps (TKEYP), %xmm1
  1731. movaps %xmm0, 240(TKEYP)
  1732. movaps %xmm1, 240(KEYP)
  1733. add $0x10, KEYP
  1734. lea 240-16(TKEYP), UKEYP
  1735. .align 4
  1736. .Ldec_key_loop:
  1737. movaps (KEYP), %xmm0
  1738. AESIMC %xmm0 %xmm1
  1739. movaps %xmm1, (UKEYP)
  1740. add $0x10, KEYP
  1741. sub $0x10, UKEYP
  1742. cmp TKEYP, KEYP
  1743. jb .Ldec_key_loop
  1744. xor AREG, AREG
  1745. #ifndef __x86_64__
  1746. popl KEYP
  1747. #endif
  1748. ret
  1749. ENDPROC(aesni_set_key)
  1750. /*
  1751. * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1752. */
  1753. ENTRY(aesni_enc)
  1754. #ifndef __x86_64__
  1755. pushl KEYP
  1756. pushl KLEN
  1757. movl 12(%esp), KEYP
  1758. movl 16(%esp), OUTP
  1759. movl 20(%esp), INP
  1760. #endif
  1761. movl 480(KEYP), KLEN # key length
  1762. movups (INP), STATE # input
  1763. call _aesni_enc1
  1764. movups STATE, (OUTP) # output
  1765. #ifndef __x86_64__
  1766. popl KLEN
  1767. popl KEYP
  1768. #endif
  1769. ret
  1770. ENDPROC(aesni_enc)
  1771. /*
  1772. * _aesni_enc1: internal ABI
  1773. * input:
  1774. * KEYP: key struct pointer
  1775. * KLEN: key length
  1776. * STATE: initial state (input)
  1777. * output:
  1778. * STATE: final state (output)
  1779. * changed:
  1780. * KEY
  1781. * TKEYP (T1)
  1782. */
  1783. .align 4
  1784. _aesni_enc1:
  1785. movaps (KEYP), KEY # key
  1786. mov KEYP, TKEYP
  1787. pxor KEY, STATE # round 0
  1788. add $0x30, TKEYP
  1789. cmp $24, KLEN
  1790. jb .Lenc128
  1791. lea 0x20(TKEYP), TKEYP
  1792. je .Lenc192
  1793. add $0x20, TKEYP
  1794. movaps -0x60(TKEYP), KEY
  1795. AESENC KEY STATE
  1796. movaps -0x50(TKEYP), KEY
  1797. AESENC KEY STATE
  1798. .align 4
  1799. .Lenc192:
  1800. movaps -0x40(TKEYP), KEY
  1801. AESENC KEY STATE
  1802. movaps -0x30(TKEYP), KEY
  1803. AESENC KEY STATE
  1804. .align 4
  1805. .Lenc128:
  1806. movaps -0x20(TKEYP), KEY
  1807. AESENC KEY STATE
  1808. movaps -0x10(TKEYP), KEY
  1809. AESENC KEY STATE
  1810. movaps (TKEYP), KEY
  1811. AESENC KEY STATE
  1812. movaps 0x10(TKEYP), KEY
  1813. AESENC KEY STATE
  1814. movaps 0x20(TKEYP), KEY
  1815. AESENC KEY STATE
  1816. movaps 0x30(TKEYP), KEY
  1817. AESENC KEY STATE
  1818. movaps 0x40(TKEYP), KEY
  1819. AESENC KEY STATE
  1820. movaps 0x50(TKEYP), KEY
  1821. AESENC KEY STATE
  1822. movaps 0x60(TKEYP), KEY
  1823. AESENC KEY STATE
  1824. movaps 0x70(TKEYP), KEY
  1825. AESENCLAST KEY STATE
  1826. ret
  1827. ENDPROC(_aesni_enc1)
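/*
 * What the KLEN dispatch in _aesni_enc1 (and in _aesni_enc4, _aesni_dec1 and
 * _aesni_dec4 below) amounts to: the number of AES rounds follows from the
 * key length in bytes.  A small C sketch of that mapping:
 *
 *	static int aes_rounds(int key_len_bytes)
 *	{
 *		switch (key_len_bytes) {
 *		case 16: return 10;     // AES-128: jumps straight to .Lenc128
 *		case 24: return 12;     // AES-192: two extra rounds at .Lenc192
 *		case 32: return 14;     // AES-256: four extra rounds before that
 *		default: return -1;     // unsupported
 *		}
 *	}
 */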
  1828. /*
  1829. * _aesni_enc4: internal ABI
  1830. * input:
  1831. * KEYP: key struct pointer
  1832. * KLEN: key length
  1833. * STATE1: initial state (input)
  1834. * STATE2
  1835. * STATE3
  1836. * STATE4
  1837. * output:
  1838. * STATE1: final state (output)
  1839. * STATE2
  1840. * STATE3
  1841. * STATE4
  1842. * changed:
  1843. * KEY
  1844. * TKEYP (T1)
  1845. */
  1846. .align 4
  1847. _aesni_enc4:
  1848. movaps (KEYP), KEY # key
  1849. mov KEYP, TKEYP
  1850. pxor KEY, STATE1 # round 0
  1851. pxor KEY, STATE2
  1852. pxor KEY, STATE3
  1853. pxor KEY, STATE4
  1854. add $0x30, TKEYP
  1855. cmp $24, KLEN
  1856. jb .L4enc128
  1857. lea 0x20(TKEYP), TKEYP
  1858. je .L4enc192
  1859. add $0x20, TKEYP
  1860. movaps -0x60(TKEYP), KEY
  1861. AESENC KEY STATE1
  1862. AESENC KEY STATE2
  1863. AESENC KEY STATE3
  1864. AESENC KEY STATE4
  1865. movaps -0x50(TKEYP), KEY
  1866. AESENC KEY STATE1
  1867. AESENC KEY STATE2
  1868. AESENC KEY STATE3
  1869. AESENC KEY STATE4
  1870. #.align 4
  1871. .L4enc192:
  1872. movaps -0x40(TKEYP), KEY
  1873. AESENC KEY STATE1
  1874. AESENC KEY STATE2
  1875. AESENC KEY STATE3
  1876. AESENC KEY STATE4
  1877. movaps -0x30(TKEYP), KEY
  1878. AESENC KEY STATE1
  1879. AESENC KEY STATE2
  1880. AESENC KEY STATE3
  1881. AESENC KEY STATE4
  1882. #.align 4
  1883. .L4enc128:
  1884. movaps -0x20(TKEYP), KEY
  1885. AESENC KEY STATE1
  1886. AESENC KEY STATE2
  1887. AESENC KEY STATE3
  1888. AESENC KEY STATE4
  1889. movaps -0x10(TKEYP), KEY
  1890. AESENC KEY STATE1
  1891. AESENC KEY STATE2
  1892. AESENC KEY STATE3
  1893. AESENC KEY STATE4
  1894. movaps (TKEYP), KEY
  1895. AESENC KEY STATE1
  1896. AESENC KEY STATE2
  1897. AESENC KEY STATE3
  1898. AESENC KEY STATE4
  1899. movaps 0x10(TKEYP), KEY
  1900. AESENC KEY STATE1
  1901. AESENC KEY STATE2
  1902. AESENC KEY STATE3
  1903. AESENC KEY STATE4
  1904. movaps 0x20(TKEYP), KEY
  1905. AESENC KEY STATE1
  1906. AESENC KEY STATE2
  1907. AESENC KEY STATE3
  1908. AESENC KEY STATE4
  1909. movaps 0x30(TKEYP), KEY
  1910. AESENC KEY STATE1
  1911. AESENC KEY STATE2
  1912. AESENC KEY STATE3
  1913. AESENC KEY STATE4
  1914. movaps 0x40(TKEYP), KEY
  1915. AESENC KEY STATE1
  1916. AESENC KEY STATE2
  1917. AESENC KEY STATE3
  1918. AESENC KEY STATE4
  1919. movaps 0x50(TKEYP), KEY
  1920. AESENC KEY STATE1
  1921. AESENC KEY STATE2
  1922. AESENC KEY STATE3
  1923. AESENC KEY STATE4
  1924. movaps 0x60(TKEYP), KEY
  1925. AESENC KEY STATE1
  1926. AESENC KEY STATE2
  1927. AESENC KEY STATE3
  1928. AESENC KEY STATE4
  1929. movaps 0x70(TKEYP), KEY
  1930. AESENCLAST KEY STATE1 # last round
  1931. AESENCLAST KEY STATE2
  1932. AESENCLAST KEY STATE3
  1933. AESENCLAST KEY STATE4
  1934. ret
  1935. ENDPROC(_aesni_enc4)
  1936. /*
  1937. * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1938. */
  1939. ENTRY(aesni_dec)
  1940. #ifndef __x86_64__
  1941. pushl KEYP
  1942. pushl KLEN
  1943. movl 12(%esp), KEYP
  1944. movl 16(%esp), OUTP
  1945. movl 20(%esp), INP
  1946. #endif
  1947. mov 480(KEYP), KLEN # key length
  1948. add $240, KEYP
  1949. movups (INP), STATE # input
  1950. call _aesni_dec1
  1951. movups STATE, (OUTP) # output
  1952. #ifndef __x86_64__
  1953. popl KLEN
  1954. popl KEYP
  1955. #endif
  1956. ret
  1957. ENDPROC(aesni_dec)
  1958. /*
  1959. * _aesni_dec1: internal ABI
  1960. * input:
  1961. * KEYP: key struct pointer
  1962. * KLEN: key length
  1963. * STATE: initial state (input)
  1964. * output:
  1965. * STATE: final state (output)
  1966. * changed:
  1967. * KEY
  1968. * TKEYP (T1)
  1969. */
  1970. .align 4
  1971. _aesni_dec1:
  1972. movaps (KEYP), KEY # key
  1973. mov KEYP, TKEYP
  1974. pxor KEY, STATE # round 0
  1975. add $0x30, TKEYP
  1976. cmp $24, KLEN
  1977. jb .Ldec128
  1978. lea 0x20(TKEYP), TKEYP
  1979. je .Ldec192
  1980. add $0x20, TKEYP
  1981. movaps -0x60(TKEYP), KEY
  1982. AESDEC KEY STATE
  1983. movaps -0x50(TKEYP), KEY
  1984. AESDEC KEY STATE
  1985. .align 4
  1986. .Ldec192:
  1987. movaps -0x40(TKEYP), KEY
  1988. AESDEC KEY STATE
  1989. movaps -0x30(TKEYP), KEY
  1990. AESDEC KEY STATE
  1991. .align 4
  1992. .Ldec128:
  1993. movaps -0x20(TKEYP), KEY
  1994. AESDEC KEY STATE
  1995. movaps -0x10(TKEYP), KEY
  1996. AESDEC KEY STATE
  1997. movaps (TKEYP), KEY
  1998. AESDEC KEY STATE
  1999. movaps 0x10(TKEYP), KEY
  2000. AESDEC KEY STATE
  2001. movaps 0x20(TKEYP), KEY
  2002. AESDEC KEY STATE
  2003. movaps 0x30(TKEYP), KEY
  2004. AESDEC KEY STATE
  2005. movaps 0x40(TKEYP), KEY
  2006. AESDEC KEY STATE
  2007. movaps 0x50(TKEYP), KEY
  2008. AESDEC KEY STATE
  2009. movaps 0x60(TKEYP), KEY
  2010. AESDEC KEY STATE
  2011. movaps 0x70(TKEYP), KEY
  2012. AESDECLAST KEY STATE
  2013. ret
  2014. ENDPROC(_aesni_dec1)
  2015. /*
  2016. * _aesni_dec4: internal ABI
  2017. * input:
  2018. * KEYP: key struct pointer
  2019. * KLEN: key length
  2020. * STATE1: initial state (input)
  2021. * STATE2
  2022. * STATE3
  2023. * STATE4
  2024. * output:
  2025. * STATE1: final state (output)
  2026. * STATE2
  2027. * STATE3
  2028. * STATE4
  2029. * changed:
  2030. * KEY
  2031. * TKEYP (T1)
  2032. */
  2033. .align 4
  2034. _aesni_dec4:
  2035. movaps (KEYP), KEY # key
  2036. mov KEYP, TKEYP
  2037. pxor KEY, STATE1 # round 0
  2038. pxor KEY, STATE2
  2039. pxor KEY, STATE3
  2040. pxor KEY, STATE4
  2041. add $0x30, TKEYP
  2042. cmp $24, KLEN
  2043. jb .L4dec128
  2044. lea 0x20(TKEYP), TKEYP
  2045. je .L4dec192
  2046. add $0x20, TKEYP
  2047. movaps -0x60(TKEYP), KEY
  2048. AESDEC KEY STATE1
  2049. AESDEC KEY STATE2
  2050. AESDEC KEY STATE3
  2051. AESDEC KEY STATE4
  2052. movaps -0x50(TKEYP), KEY
  2053. AESDEC KEY STATE1
  2054. AESDEC KEY STATE2
  2055. AESDEC KEY STATE3
  2056. AESDEC KEY STATE4
  2057. .align 4
  2058. .L4dec192:
  2059. movaps -0x40(TKEYP), KEY
  2060. AESDEC KEY STATE1
  2061. AESDEC KEY STATE2
  2062. AESDEC KEY STATE3
  2063. AESDEC KEY STATE4
  2064. movaps -0x30(TKEYP), KEY
  2065. AESDEC KEY STATE1
  2066. AESDEC KEY STATE2
  2067. AESDEC KEY STATE3
  2068. AESDEC KEY STATE4
  2069. .align 4
  2070. .L4dec128:
  2071. movaps -0x20(TKEYP), KEY
  2072. AESDEC KEY STATE1
  2073. AESDEC KEY STATE2
  2074. AESDEC KEY STATE3
  2075. AESDEC KEY STATE4
  2076. movaps -0x10(TKEYP), KEY
  2077. AESDEC KEY STATE1
  2078. AESDEC KEY STATE2
  2079. AESDEC KEY STATE3
  2080. AESDEC KEY STATE4
  2081. movaps (TKEYP), KEY
  2082. AESDEC KEY STATE1
  2083. AESDEC KEY STATE2
  2084. AESDEC KEY STATE3
  2085. AESDEC KEY STATE4
  2086. movaps 0x10(TKEYP), KEY
  2087. AESDEC KEY STATE1
  2088. AESDEC KEY STATE2
  2089. AESDEC KEY STATE3
  2090. AESDEC KEY STATE4
  2091. movaps 0x20(TKEYP), KEY
  2092. AESDEC KEY STATE1
  2093. AESDEC KEY STATE2
  2094. AESDEC KEY STATE3
  2095. AESDEC KEY STATE4
  2096. movaps 0x30(TKEYP), KEY
  2097. AESDEC KEY STATE1
  2098. AESDEC KEY STATE2
  2099. AESDEC KEY STATE3
  2100. AESDEC KEY STATE4
  2101. movaps 0x40(TKEYP), KEY
  2102. AESDEC KEY STATE1
  2103. AESDEC KEY STATE2
  2104. AESDEC KEY STATE3
  2105. AESDEC KEY STATE4
  2106. movaps 0x50(TKEYP), KEY
  2107. AESDEC KEY STATE1
  2108. AESDEC KEY STATE2
  2109. AESDEC KEY STATE3
  2110. AESDEC KEY STATE4
  2111. movaps 0x60(TKEYP), KEY
  2112. AESDEC KEY STATE1
  2113. AESDEC KEY STATE2
  2114. AESDEC KEY STATE3
  2115. AESDEC KEY STATE4
  2116. movaps 0x70(TKEYP), KEY
  2117. AESDECLAST KEY STATE1 # last round
  2118. AESDECLAST KEY STATE2
  2119. AESDECLAST KEY STATE3
  2120. AESDECLAST KEY STATE4
  2121. ret
  2122. ENDPROC(_aesni_dec4)
  2123. /*
  2124. * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2125. * size_t len)
  2126. */
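/*
 * Reference model (C, illustrative only) of the loop below: in ECB mode each
 * 16-byte block is encrypted independently, and the 4-block path is purely a
 * batching optimisation.  aes_encrypt_block() is a hypothetical per-block
 * helper, not a function defined in this file.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	extern void aes_encrypt_block(void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void ecb_enc_ref(void *ctx, uint8_t *dst, const uint8_t *src,
 *				size_t len)
 *	{
 *		// a trailing partial block is skipped, as the length checks below do
 *		for (size_t off = 0; off + 16 <= len; off += 16)
 *			aes_encrypt_block(ctx, dst + off, src + off);
 *	}
 */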
  2127. ENTRY(aesni_ecb_enc)
  2128. #ifndef __x86_64__
  2129. pushl LEN
  2130. pushl KEYP
  2131. pushl KLEN
  2132. movl 16(%esp), KEYP
  2133. movl 20(%esp), OUTP
  2134. movl 24(%esp), INP
  2135. movl 28(%esp), LEN
  2136. #endif
  2137. test LEN, LEN # check length
  2138. jz .Lecb_enc_ret
  2139. mov 480(KEYP), KLEN
  2140. cmp $16, LEN
  2141. jb .Lecb_enc_ret
  2142. cmp $64, LEN
  2143. jb .Lecb_enc_loop1
  2144. .align 4
  2145. .Lecb_enc_loop4:
  2146. movups (INP), STATE1
  2147. movups 0x10(INP), STATE2
  2148. movups 0x20(INP), STATE3
  2149. movups 0x30(INP), STATE4
  2150. call _aesni_enc4
  2151. movups STATE1, (OUTP)
  2152. movups STATE2, 0x10(OUTP)
  2153. movups STATE3, 0x20(OUTP)
  2154. movups STATE4, 0x30(OUTP)
  2155. sub $64, LEN
  2156. add $64, INP
  2157. add $64, OUTP
  2158. cmp $64, LEN
  2159. jge .Lecb_enc_loop4
  2160. cmp $16, LEN
  2161. jb .Lecb_enc_ret
  2162. .align 4
  2163. .Lecb_enc_loop1:
  2164. movups (INP), STATE1
  2165. call _aesni_enc1
  2166. movups STATE1, (OUTP)
  2167. sub $16, LEN
  2168. add $16, INP
  2169. add $16, OUTP
  2170. cmp $16, LEN
  2171. jge .Lecb_enc_loop1
  2172. .Lecb_enc_ret:
  2173. #ifndef __x86_64__
  2174. popl KLEN
  2175. popl KEYP
  2176. popl LEN
  2177. #endif
  2178. ret
  2179. ENDPROC(aesni_ecb_enc)
  2180. /*
  2181. * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2182. * size_t len);
  2183. */
  2184. ENTRY(aesni_ecb_dec)
  2185. #ifndef __x86_64__
  2186. pushl LEN
  2187. pushl KEYP
  2188. pushl KLEN
  2189. movl 16(%esp), KEYP
  2190. movl 20(%esp), OUTP
  2191. movl 24(%esp), INP
  2192. movl 28(%esp), LEN
  2193. #endif
  2194. test LEN, LEN
  2195. jz .Lecb_dec_ret
  2196. mov 480(KEYP), KLEN
  2197. add $240, KEYP
  2198. cmp $16, LEN
  2199. jb .Lecb_dec_ret
  2200. cmp $64, LEN
  2201. jb .Lecb_dec_loop1
  2202. .align 4
  2203. .Lecb_dec_loop4:
  2204. movups (INP), STATE1
  2205. movups 0x10(INP), STATE2
  2206. movups 0x20(INP), STATE3
  2207. movups 0x30(INP), STATE4
  2208. call _aesni_dec4
  2209. movups STATE1, (OUTP)
  2210. movups STATE2, 0x10(OUTP)
  2211. movups STATE3, 0x20(OUTP)
  2212. movups STATE4, 0x30(OUTP)
  2213. sub $64, LEN
  2214. add $64, INP
  2215. add $64, OUTP
  2216. cmp $64, LEN
  2217. jge .Lecb_dec_loop4
  2218. cmp $16, LEN
  2219. jb .Lecb_dec_ret
  2220. .align 4
  2221. .Lecb_dec_loop1:
  2222. movups (INP), STATE1
  2223. call _aesni_dec1
  2224. movups STATE1, (OUTP)
  2225. sub $16, LEN
  2226. add $16, INP
  2227. add $16, OUTP
  2228. cmp $16, LEN
  2229. jge .Lecb_dec_loop1
  2230. .Lecb_dec_ret:
  2231. #ifndef __x86_64__
  2232. popl KLEN
  2233. popl KEYP
  2234. popl LEN
  2235. #endif
  2236. ret
  2237. ENDPROC(aesni_ecb_dec)
  2238. /*
  2239. * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2240. * size_t len, u8 *iv)
  2241. */
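/*
 * Reference model (C, illustrative only) of the CBC encryption loop below:
 * each plaintext block is XORed with the previous ciphertext block
 * (initially the IV) before encryption, and the last ciphertext block is
 * written back as the updated IV.  aes_encrypt_block() is a hypothetical
 * per-block helper.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	extern void aes_encrypt_block(void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void cbc_enc_ref(void *ctx, uint8_t *dst, const uint8_t *src,
 *				size_t len, uint8_t *iv)
 *	{
 *		uint8_t state[16];
 *
 *		memcpy(state, iv, 16);                       // load iv as initial state
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			for (int i = 0; i < 16; i++)
 *				state[i] ^= src[off + i];    // pxor IN, STATE
 *			aes_encrypt_block(ctx, state, state);
 *			memcpy(dst + off, state, 16);
 *		}
 *		memcpy(iv, state, 16);                       // movups STATE, (IVP)
 *	}
 */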
  2242. ENTRY(aesni_cbc_enc)
  2243. #ifndef __x86_64__
  2244. pushl IVP
  2245. pushl LEN
  2246. pushl KEYP
  2247. pushl KLEN
  2248. movl 20(%esp), KEYP
  2249. movl 24(%esp), OUTP
  2250. movl 28(%esp), INP
  2251. movl 32(%esp), LEN
  2252. movl 36(%esp), IVP
  2253. #endif
  2254. cmp $16, LEN
  2255. jb .Lcbc_enc_ret
  2256. mov 480(KEYP), KLEN
  2257. movups (IVP), STATE # load iv as initial state
  2258. .align 4
  2259. .Lcbc_enc_loop:
  2260. movups (INP), IN # load input
  2261. pxor IN, STATE
  2262. call _aesni_enc1
  2263. movups STATE, (OUTP) # store output
  2264. sub $16, LEN
  2265. add $16, INP
  2266. add $16, OUTP
  2267. cmp $16, LEN
  2268. jge .Lcbc_enc_loop
  2269. movups STATE, (IVP)
  2270. .Lcbc_enc_ret:
  2271. #ifndef __x86_64__
  2272. popl KLEN
  2273. popl KEYP
  2274. popl LEN
  2275. popl IVP
  2276. #endif
  2277. ret
  2278. ENDPROC(aesni_cbc_enc)
  2279. /*
  2280. * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2281. * size_t len, u8 *iv)
  2282. */
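/*
 * Reference model (C, illustrative only) of the CBC decryption loop below:
 * each plaintext block is D(K, C_i) XOR C_{i-1}, with C_0 taken from the IV;
 * the ciphertext block is saved before the output store so that in-place
 * decryption works, just as the code keeps IN1..IN4 around.
 * aes_decrypt_block() is a hypothetical per-block helper.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	extern void aes_decrypt_block(void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void cbc_dec_ref(void *ctx, uint8_t *dst, const uint8_t *src,
 *				size_t len, uint8_t *iv)
 *	{
 *		uint8_t prev[16], cblk[16], pblk[16];
 *
 *		memcpy(prev, iv, 16);
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			memcpy(cblk, src + off, 16);         // keep C_i for chaining
 *			aes_decrypt_block(ctx, pblk, cblk);
 *			for (int i = 0; i < 16; i++)
 *				dst[off + i] = pblk[i] ^ prev[i];
 *			memcpy(prev, cblk, 16);
 *		}
 *		memcpy(iv, prev, 16);                        // movups IV, (IVP)
 *	}
 */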
  2283. ENTRY(aesni_cbc_dec)
  2284. #ifndef __x86_64__
  2285. pushl IVP
  2286. pushl LEN
  2287. pushl KEYP
  2288. pushl KLEN
  2289. movl 20(%esp), KEYP
  2290. movl 24(%esp), OUTP
  2291. movl 28(%esp), INP
  2292. movl 32(%esp), LEN
  2293. movl 36(%esp), IVP
  2294. #endif
  2295. cmp $16, LEN
  2296. jb .Lcbc_dec_just_ret
  2297. mov 480(KEYP), KLEN
  2298. add $240, KEYP
  2299. movups (IVP), IV
  2300. cmp $64, LEN
  2301. jb .Lcbc_dec_loop1
  2302. .align 4
  2303. .Lcbc_dec_loop4:
  2304. movups (INP), IN1
  2305. movaps IN1, STATE1
  2306. movups 0x10(INP), IN2
  2307. movaps IN2, STATE2
  2308. #ifdef __x86_64__
  2309. movups 0x20(INP), IN3
  2310. movaps IN3, STATE3
  2311. movups 0x30(INP), IN4
  2312. movaps IN4, STATE4
  2313. #else
  2314. movups 0x20(INP), IN1
  2315. movaps IN1, STATE3
  2316. movups 0x30(INP), IN2
  2317. movaps IN2, STATE4
  2318. #endif
  2319. call _aesni_dec4
  2320. pxor IV, STATE1
  2321. #ifdef __x86_64__
  2322. pxor IN1, STATE2
  2323. pxor IN2, STATE3
  2324. pxor IN3, STATE4
  2325. movaps IN4, IV
  2326. #else
  2327. pxor IN1, STATE4
  2328. movaps IN2, IV
  2329. movups (INP), IN1
  2330. pxor IN1, STATE2
  2331. movups 0x10(INP), IN2
  2332. pxor IN2, STATE3
  2333. #endif
  2334. movups STATE1, (OUTP)
  2335. movups STATE2, 0x10(OUTP)
  2336. movups STATE3, 0x20(OUTP)
  2337. movups STATE4, 0x30(OUTP)
  2338. sub $64, LEN
  2339. add $64, INP
  2340. add $64, OUTP
  2341. cmp $64, LEN
  2342. jge .Lcbc_dec_loop4
  2343. cmp $16, LEN
  2344. jb .Lcbc_dec_ret
  2345. .align 4
  2346. .Lcbc_dec_loop1:
  2347. movups (INP), IN
  2348. movaps IN, STATE
  2349. call _aesni_dec1
  2350. pxor IV, STATE
  2351. movups STATE, (OUTP)
  2352. movaps IN, IV
  2353. sub $16, LEN
  2354. add $16, INP
  2355. add $16, OUTP
  2356. cmp $16, LEN
  2357. jge .Lcbc_dec_loop1
  2358. .Lcbc_dec_ret:
  2359. movups IV, (IVP)
  2360. .Lcbc_dec_just_ret:
  2361. #ifndef __x86_64__
  2362. popl KLEN
  2363. popl KEYP
  2364. popl LEN
  2365. popl IVP
  2366. #endif
  2367. ret
  2368. ENDPROC(aesni_cbc_dec)
  2369. #ifdef __x86_64__
  2370. .align 16
  2371. .Lbswap_mask:
  2372. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  2373. /*
  2374. * _aesni_inc_init: internal ABI
  2375. * setup registers used by _aesni_inc
  2376. * input:
  2377. * IV
  2378. * output:
  2379. * CTR: == IV, in little endian
  2380. * TCTR_LOW: == lower qword of CTR
  2381. * INC: == 1, in little endian
  2382. * BSWAP_MASK == endian swapping mask
  2383. */
  2384. .align 4
  2385. _aesni_inc_init:
  2386. movaps .Lbswap_mask, BSWAP_MASK
  2387. movaps IV, CTR
  2388. PSHUFB_XMM BSWAP_MASK CTR
  2389. mov $1, TCTR_LOW
  2390. MOVQ_R64_XMM TCTR_LOW INC
  2391. MOVQ_R64_XMM CTR TCTR_LOW
  2392. ret
  2393. ENDPROC(_aesni_inc_init)
  2394. /*
  2395. * _aesni_inc: internal ABI
  2396. * Increase IV by 1, IV is in big endian
  2397. * input:
  2398. * IV
  2399. * CTR: == IV, in little endian
  2400. * TCTR_LOW: == lower qword of CTR
  2401. * INC: == 1, in little endian
  2402. * BSWAP_MASK == endian swapping mask
  2403. * output:
  2404. * IV: increased by 1
  2405. * changed:
  2406. * CTR: == output IV, in little endian
  2407. * TCTR_LOW: == lower qword of CTR
  2408. */
  2409. .align 4
  2410. _aesni_inc:
  2411. paddq INC, CTR
  2412. add $1, TCTR_LOW
  2413. jnc .Linc_low
  2414. pslldq $8, INC
  2415. paddq INC, CTR
  2416. psrldq $8, INC
  2417. .Linc_low:
  2418. movaps CTR, IV
  2419. PSHUFB_XMM BSWAP_MASK IV
  2420. ret
  2421. ENDPROC(_aesni_inc)
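/*
 * Rough C equivalent of _aesni_inc_init/_aesni_inc above: the big-endian IV
 * is kept byte-swapped so the counter can be bumped with a single 64-bit add,
 * and the carry into the high half is only propagated when the low half wraps
 * (the jnc .Linc_low fast path).  Types and names are illustrative.
 *
 *	#include <stdint.h>
 *
 *	struct ctr_le {             // little-endian image of the IV (CTR/TCTR_LOW)
 *		uint64_t lo, hi;
 *	};
 *
 *	static void ctr_le_inc(struct ctr_le *c)
 *	{
 *		if (++c->lo == 0)   // low half wrapped: carry into the top half
 *			c->hi++;
 *	}
 */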
  2422. /*
  2423. * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2424. * size_t len, u8 *iv)
  2425. */
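/*
 * Reference model (C, illustrative only) of the CTR loop below: each block of
 * keystream is the encryption of the current counter value, which is then
 * advanced for the next block.  A simple byte-wise big-endian increment is
 * used here instead of the byte-swapped trick sketched above;
 * aes_encrypt_block() is a hypothetical per-block helper.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	extern void aes_encrypt_block(void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void ctr128_inc(uint8_t ctr[16])      // big-endian +1
 *	{
 *		for (int i = 15; i >= 0; i--)
 *			if (++ctr[i] != 0)
 *				break;
 *	}
 *
 *	static void ctr_enc_ref(void *ctx, uint8_t *dst, const uint8_t *src,
 *				size_t len, uint8_t ctr[16])
 *	{
 *		uint8_t ks[16];
 *
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			aes_encrypt_block(ctx, ks, ctr);     // E(K, counter)
 *			ctr128_inc(ctr);                     // call _aesni_inc
 *			for (int i = 0; i < 16; i++)
 *				dst[off + i] = src[off + i] ^ ks[i];
 *		}
 *	}
 */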
  2426. ENTRY(aesni_ctr_enc)
  2427. cmp $16, LEN
  2428. jb .Lctr_enc_just_ret
  2429. mov 480(KEYP), KLEN
  2430. movups (IVP), IV
  2431. call _aesni_inc_init
  2432. cmp $64, LEN
  2433. jb .Lctr_enc_loop1
  2434. .align 4
  2435. .Lctr_enc_loop4:
  2436. movaps IV, STATE1
  2437. call _aesni_inc
  2438. movups (INP), IN1
  2439. movaps IV, STATE2
  2440. call _aesni_inc
  2441. movups 0x10(INP), IN2
  2442. movaps IV, STATE3
  2443. call _aesni_inc
  2444. movups 0x20(INP), IN3
  2445. movaps IV, STATE4
  2446. call _aesni_inc
  2447. movups 0x30(INP), IN4
  2448. call _aesni_enc4
  2449. pxor IN1, STATE1
  2450. movups STATE1, (OUTP)
  2451. pxor IN2, STATE2
  2452. movups STATE2, 0x10(OUTP)
  2453. pxor IN3, STATE3
  2454. movups STATE3, 0x20(OUTP)
  2455. pxor IN4, STATE4
  2456. movups STATE4, 0x30(OUTP)
  2457. sub $64, LEN
  2458. add $64, INP
  2459. add $64, OUTP
  2460. cmp $64, LEN
  2461. jge .Lctr_enc_loop4
  2462. cmp $16, LEN
  2463. jb .Lctr_enc_ret
  2464. .align 4
  2465. .Lctr_enc_loop1:
  2466. movaps IV, STATE
  2467. call _aesni_inc
  2468. movups (INP), IN
  2469. call _aesni_enc1
  2470. pxor IN, STATE
  2471. movups STATE, (OUTP)
  2472. sub $16, LEN
  2473. add $16, INP
  2474. add $16, OUTP
  2475. cmp $16, LEN
  2476. jge .Lctr_enc_loop1
  2477. .Lctr_enc_ret:
  2478. movups IV, (IVP)
  2479. .Lctr_enc_just_ret:
  2480. ret
  2481. ENDPROC(aesni_ctr_enc)
  2482. /*
  2483. * _aesni_gf128mul_x_ble: internal ABI
  2484. * Multiply in GF(2^128) for XTS IVs
  2485. * input:
  2486. * IV: current IV
  2487. * GF128MUL_MASK == mask with 0x87 and 0x01
  2488. * output:
  2489. * IV: next IV
  2490. * changed:
  2491. * CTR: == temporary value
  2492. */
  2493. #define _aesni_gf128mul_x_ble() \
  2494. pshufd $0x13, IV, CTR; \
  2495. paddq IV, IV; \
  2496. psrad $31, CTR; \
  2497. pand GF128MUL_MASK, CTR; \
  2498. pxor CTR, IV;
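/*
 * C model (illustrative only) of the _aesni_gf128mul_x_ble() macro above: the
 * 16-byte XTS tweak is treated as a 128-bit little-endian value and multiplied
 * by x in GF(2^128), with the bit shifted out at the top folded back in via
 * the 0x87 feedback term that GF128MUL_MASK encodes.
 *
 *	#include <stdint.h>
 *
 *	static void gf128mul_x_ble_ref(uint64_t *lo, uint64_t *hi)  // lo = bytes 0..7
 *	{
 *		uint64_t carry = *hi >> 63;              // bit falling off the top
 *
 *		*hi = (*hi << 1) | (*lo >> 63);          // 128-bit shift left by one
 *		*lo = (*lo << 1) ^ (carry ? 0x87 : 0);   // x^7 + x^2 + x + 1
 *	}
 */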
  2499. /*
  2500. * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2501. * bool enc, u8 *iv)
  2502. */
  2503. ENTRY(aesni_xts_crypt8)
  2504. cmpb $0, %cl
  2505. movl $0, %ecx
  2506. movl $240, %r10d
  2507. leaq _aesni_enc4, %r11
  2508. leaq _aesni_dec4, %rax
  2509. cmovel %r10d, %ecx
  2510. cmoveq %rax, %r11
  2511. movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2512. movups (IVP), IV
  2513. mov 480(KEYP), KLEN
  2514. addq %rcx, KEYP
  2515. movdqa IV, STATE1
  2516. movdqu 0x00(INP), INC
  2517. pxor INC, STATE1
  2518. movdqu IV, 0x00(OUTP)
  2519. _aesni_gf128mul_x_ble()
  2520. movdqa IV, STATE2
  2521. movdqu 0x10(INP), INC
  2522. pxor INC, STATE2
  2523. movdqu IV, 0x10(OUTP)
  2524. _aesni_gf128mul_x_ble()
  2525. movdqa IV, STATE3
  2526. movdqu 0x20(INP), INC
  2527. pxor INC, STATE3
  2528. movdqu IV, 0x20(OUTP)
  2529. _aesni_gf128mul_x_ble()
  2530. movdqa IV, STATE4
  2531. movdqu 0x30(INP), INC
  2532. pxor INC, STATE4
  2533. movdqu IV, 0x30(OUTP)
  2534. CALL_NOSPEC %r11
  2535. movdqu 0x00(OUTP), INC
  2536. pxor INC, STATE1
  2537. movdqu STATE1, 0x00(OUTP)
  2538. _aesni_gf128mul_x_ble()
  2539. movdqa IV, STATE1
  2540. movdqu 0x40(INP), INC
  2541. pxor INC, STATE1
  2542. movdqu IV, 0x40(OUTP)
  2543. movdqu 0x10(OUTP), INC
  2544. pxor INC, STATE2
  2545. movdqu STATE2, 0x10(OUTP)
  2546. _aesni_gf128mul_x_ble()
  2547. movdqa IV, STATE2
  2548. movdqu 0x50(INP), INC
  2549. pxor INC, STATE2
  2550. movdqu IV, 0x50(OUTP)
  2551. movdqu 0x20(OUTP), INC
  2552. pxor INC, STATE3
  2553. movdqu STATE3, 0x20(OUTP)
  2554. _aesni_gf128mul_x_ble()
  2555. movdqa IV, STATE3
  2556. movdqu 0x60(INP), INC
  2557. pxor INC, STATE3
  2558. movdqu IV, 0x60(OUTP)
  2559. movdqu 0x30(OUTP), INC
  2560. pxor INC, STATE4
  2561. movdqu STATE4, 0x30(OUTP)
  2562. _aesni_gf128mul_x_ble()
  2563. movdqa IV, STATE4
  2564. movdqu 0x70(INP), INC
  2565. pxor INC, STATE4
  2566. movdqu IV, 0x70(OUTP)
  2567. _aesni_gf128mul_x_ble()
  2568. movups IV, (IVP)
  2569. CALL_NOSPEC %r11
  2570. movdqu 0x40(OUTP), INC
  2571. pxor INC, STATE1
  2572. movdqu STATE1, 0x40(OUTP)
  2573. movdqu 0x50(OUTP), INC
  2574. pxor INC, STATE2
  2575. movdqu STATE2, 0x50(OUTP)
  2576. movdqu 0x60(OUTP), INC
  2577. pxor INC, STATE3
  2578. movdqu STATE3, 0x60(OUTP)
  2579. movdqu 0x70(OUTP), INC
  2580. pxor INC, STATE4
  2581. movdqu STATE4, 0x70(OUTP)
  2582. ret
  2583. ENDPROC(aesni_xts_crypt8)
  2584. #endif