123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529 |
- /*
- * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
- *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
- /* included by aes-ce.S and aes-neon.S */
- .text
- .align 4
- /*
- * There are several ways to instantiate this code:
- * - no interleave, all inline
- * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
- * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
- * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
- * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
- *
- * Macros imported by this code:
- * - enc_prepare - setup NEON registers for encryption
- * - dec_prepare - setup NEON registers for decryption
- * - enc_switch_key - change to new key after having prepared for encryption
- * - encrypt_block - encrypt a single block
- * - decrypt_block - decrypt a single block
- * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
- * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
- */
- #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) /* out-of-line build: the N-way block code is emitted once and called */
- #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
- #define FRAME_POP ldp x29, x30, [sp],#16
- #if INTERLEAVE == 2 /* 2-way helper subroutines */
- aes_encrypt_block2x:
- encrypt_block2x v0, v1, w3, x2, x6, w7 /* same operands as the inline do_encrypt_block2x variant below */
- ret
- ENDPROC(aes_encrypt_block2x)
- aes_decrypt_block2x:
- decrypt_block2x v0, v1, w3, x2, x6, w7 /* same operands as the inline do_decrypt_block2x variant below */
- ret
- ENDPROC(aes_decrypt_block2x)
- #elif INTERLEAVE == 4 /* 4-way helper subroutines */
- aes_encrypt_block4x:
- encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 /* same operands as the inline do_encrypt_block4x variant below */
- ret
- ENDPROC(aes_encrypt_block4x)
- aes_decrypt_block4x:
- decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 /* same operands as the inline do_decrypt_block4x variant below */
- ret
- ENDPROC(aes_decrypt_block4x)
- #else
- #error INTERLEAVE should equal 2 or 4
- #endif
- .macro do_encrypt_block2x /* out-of-line flavour: call the shared routine */
- bl aes_encrypt_block2x /* bl overwrites x30, which is why users bracket their body with FRAME_PUSH/FRAME_POP */
- .endm
- .macro do_decrypt_block2x
- bl aes_decrypt_block2x /* overwrites x30, see FRAME_PUSH/FRAME_POP */
- .endm
- .macro do_encrypt_block4x
- bl aes_encrypt_block4x /* overwrites x30, see FRAME_PUSH/FRAME_POP */
- .endm
- .macro do_decrypt_block4x
- bl aes_decrypt_block4x /* overwrites x30, see FRAME_PUSH/FRAME_POP */
- .endm
- #else /* INTERLEAVE undefined, or INTERLEAVE_INLINE set: expand the block macros in place */
- #define FRAME_PUSH
- #define FRAME_POP
- .macro do_encrypt_block2x /* inline flavour: no call, so the empty FRAME_PUSH/FRAME_POP suffice */
- encrypt_block2x v0, v1, w3, x2, x6, w7 /* data in v0-v1; w3/x2 are rounds/round-key pointer in the callers; x6, w7 presumably scratch (macro defined elsewhere) */
- .endm
- .macro do_decrypt_block2x
- decrypt_block2x v0, v1, w3, x2, x6, w7
- .endm
- .macro do_encrypt_block4x
- encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 /* data in v0-v3; remaining operands as for the 2x form */
- .endm
- .macro do_decrypt_block4x
- decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
- .endm
- #endif
- /*
- * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, int first)
- * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, int first)
- *
- * Register use in both routines (arguments in prototype order):
- * x0 = out (post-incremented by each store), x1 = in (post-incremented
- * by each load), x2 = rk, w3 = rounds, w4 = blocks remaining, w5 = first.
- * enc_prepare/dec_prepare is skipped when first == 0.
- */
- AES_ENTRY(aes_ecb_encrypt)
- FRAME_PUSH
- cbz w5, .LecbencloopNx /* first == 0: skip enc_prepare */
- enc_prepare w3, x2, x5 /* x5 is handed to the macro here, so 'first' is not read again */
- .LecbencloopNx:
- #if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE /* claim INTERLEAVE blocks */
- bmi .Lecbenc1x /* fewer than INTERLEAVE left: finish one at a time */
- #if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
- do_encrypt_block2x
- st1 {v0.16b-v1.16b}, [x0], #32 /* store 2 ct blocks */
- #else
- ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- do_encrypt_block4x
- st1 {v0.16b-v3.16b}, [x0], #64 /* store 4 ct blocks */
- #endif
- b .LecbencloopNx
- .Lecbenc1x:
- adds w4, w4, #INTERLEAVE /* undo the last subtraction */
- beq .Lecbencout /* nothing left over */
- #endif
- .Lecbencloop:
- ld1 {v0.16b}, [x1], #16 /* get next pt block */
- encrypt_block v0, w3, x2, x5, w6
- st1 {v0.16b}, [x0], #16 /* store ct block */
- subs w4, w4, #1
- bne .Lecbencloop /* loop until the block count reaches zero */
- .Lecbencout:
- FRAME_POP
- ret
- AES_ENDPROC(aes_ecb_encrypt)
- AES_ENTRY(aes_ecb_decrypt) /* x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first */
- FRAME_PUSH
- cbz w5, .LecbdecloopNx /* first == 0: skip dec_prepare */
- dec_prepare w3, x2, x5 /* x5 is handed to the macro here, so 'first' is not read again */
- .LecbdecloopNx:
- #if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE /* claim INTERLEAVE blocks */
- bmi .Lecbdec1x /* fewer than INTERLEAVE left: finish one at a time */
- #if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- do_decrypt_block2x
- st1 {v0.16b-v1.16b}, [x0], #32 /* store 2 pt blocks */
- #else
- ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- do_decrypt_block4x
- st1 {v0.16b-v3.16b}, [x0], #64 /* store 4 pt blocks */
- #endif
- b .LecbdecloopNx
- .Lecbdec1x:
- adds w4, w4, #INTERLEAVE /* undo the last subtraction */
- beq .Lecbdecout /* nothing left over */
- #endif
- .Lecbdecloop:
- ld1 {v0.16b}, [x1], #16 /* get next ct block */
- decrypt_block v0, w3, x2, x5, w6
- st1 {v0.16b}, [x0], #16 /* store pt block */
- subs w4, w4, #1
- bne .Lecbdecloop /* loop until the block count reaches zero */
- .Lecbdecout:
- FRAME_POP
- ret
- AES_ENDPROC(aes_ecb_decrypt)
- /*
- * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[], int first)
- * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[], int first)
- *
- * Register use (arguments in prototype order): x0 = out, x1 = in,
- * x2 = rk, w3 = rounds, w4 = blocks remaining, x5 = iv, w6 = first.
- * When first == 0 both the iv load and the key preparation are skipped,
- * and the chaining value is taken from the NEON register as it stands
- * (v0 for encryption, v7 for decryption).
- * Both routines store the final chaining value back to iv[] on exit.
- */
- AES_ENTRY(aes_cbc_encrypt)
- cbz w6, .Lcbcencloop /* first == 0: keep v0 and the prepared key as they are */
- ld1 {v0.16b}, [x5] /* get iv */
- enc_prepare w3, x2, x6
- .Lcbcencloop:
- ld1 {v1.16b}, [x1], #16 /* get next pt block */
- eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
- encrypt_block v0, w3, x2, x6, w7 /* v0 = ct, which is also the next chaining value */
- st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- bne .Lcbcencloop /* loop until the block count reaches zero */
- st1 {v0.16b}, [x5] /* return iv */
- ret
- AES_ENDPROC(aes_cbc_encrypt)
- AES_ENTRY(aes_cbc_decrypt) /* x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv, w6 = first */
- FRAME_PUSH
- cbz w6, .LcbcdecloopNx /* first == 0: keep v7 and the prepared key as they are */
- ld1 {v7.16b}, [x5] /* get iv */
- dec_prepare w3, x2, x6
- .LcbcdecloopNx:
- #if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE /* claim INTERLEAVE blocks */
- bmi .Lcbcdec1x /* fewer than INTERLEAVE left: finish one at a time */
- #if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- mov v2.16b, v0.16b /* keep ct0: it is xored into pt1 */
- mov v3.16b, v1.16b /* keep ct1: it becomes the next iv */
- do_decrypt_block2x
- eor v0.16b, v0.16b, v7.16b /* pt0 = D(ct0) ^ iv */
- eor v1.16b, v1.16b, v2.16b /* pt1 = D(ct1) ^ ct0 */
- mov v7.16b, v3.16b /* ct1 is the next iv */
- st1 {v0.16b-v1.16b}, [x0], #32
- #else
- ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- mov v4.16b, v0.16b /* keep ct0..ct2 for the xors below */
- mov v5.16b, v1.16b
- mov v6.16b, v2.16b
- do_decrypt_block4x
- sub x1, x1, #16 /* step back to ct3, which was not copied */
- eor v0.16b, v0.16b, v7.16b /* pt0 = D(ct0) ^ iv */
- eor v1.16b, v1.16b, v4.16b /* pt1 = D(ct1) ^ ct0 */
- ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
- eor v2.16b, v2.16b, v5.16b /* pt2 = D(ct2) ^ ct1 */
- eor v3.16b, v3.16b, v6.16b /* pt3 = D(ct3) ^ ct2 */
- st1 {v0.16b-v3.16b}, [x0], #64
- #endif
- b .LcbcdecloopNx
- .Lcbcdec1x:
- adds w4, w4, #INTERLEAVE /* undo the last subtraction */
- beq .Lcbcdecout /* nothing left over */
- #endif
- .Lcbcdecloop:
- ld1 {v1.16b}, [x1], #16 /* get next ct block */
- mov v0.16b, v1.16b /* ...and copy to v0 */
- decrypt_block v0, w3, x2, x6, w7
- eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
- mov v7.16b, v1.16b /* ct is next iv */
- st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- bne .Lcbcdecloop /* loop until the block count reaches zero */
- .Lcbcdecout:
- FRAME_POP
- st1 {v7.16b}, [x5] /* return iv */
- ret
- AES_ENDPROC(aes_cbc_decrypt)
- /*
- * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 ctr[], int first)
- *
- * Register use (arguments in prototype order): x0 = out, x1 = in,
- * x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr, w6 = first.
- * v4 holds the counter block; x8 holds a byte-reversed copy of its
- * second 64-bit half (v4.d[1]) so it can be incremented with plain adds.
- * When first == 0 neither the key preparation nor the load of ctr[] into
- * v4 is performed; v4 is used as it stands.
- * A block count that goes negative in the single-block loop selects the
- * 8-byte "half block" tail, which returns without writing ctr[] back.
- */
- AES_ENTRY(aes_ctr_encrypt)
- FRAME_PUSH
- cbz w6, .Lctrnotfirst /* 1st time around? */
- enc_prepare w3, x2, x6
- ld1 {v4.16b}, [x5] /* load the counter block */
- .Lctrnotfirst:
- umov x8, v4.d[1] /* keep swabbed ctr in reg */
- rev x8, x8
- #if INTERLEAVE >= 2
- cmn w8, w4 /* 32 bit overflow? */
- bcs .Lctrloop /* carry out of the low 32 bits: use the single-block loop, which handles carries */
- .LctrloopNx:
- subs w4, w4, #INTERLEAVE /* claim INTERLEAVE blocks */
- bmi .Lctr1x /* fewer than INTERLEAVE left: finish one at a time */
- #if INTERLEAVE == 2
- mov v0.8b, v4.8b /* v0.d[0] = v4.d[0] */
- mov v1.8b, v4.8b /* v1.d[0] = v4.d[0] */
- rev x7, x8 /* current counter, back in stored byte order */
- add x8, x8, #1
- ins v0.d[1], x7 /* v0 = counter block n */
- rev x7, x8
- add x8, x8, #1
- ins v1.d[1], x7 /* v1 = counter block n + 1 */
- ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
- do_encrypt_block2x
- eor v0.16b, v0.16b, v2.16b /* keystream ^ input */
- eor v1.16b, v1.16b, v3.16b
- st1 {v0.16b-v1.16b}, [x0], #32
- #else
- ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
- dup v7.4s, w8 /* low 32 bits of the counter in every lane */
- mov v0.16b, v4.16b /* v0 = current counter block, unmodified */
- add v7.4s, v7.4s, v8.4s /* add the per-lane addends */
- mov v1.16b, v4.16b
- rev32 v8.16b, v7.16b /* byte-swap each 32-bit lane back to stored order */
- mov v2.16b, v4.16b
- mov v3.16b, v4.16b
- mov v1.s[3], v8.s[0] /* patch the last 32-bit word of v1..v3 with the bumped counters */
- mov v2.s[3], v8.s[1]
- mov v3.s[3], v8.s[2]
- ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
- do_encrypt_block4x
- eor v0.16b, v5.16b, v0.16b /* keystream ^ input */
- ld1 {v5.16b}, [x1], #16 /* get 1 input block */
- eor v1.16b, v6.16b, v1.16b
- eor v2.16b, v7.16b, v2.16b
- eor v3.16b, v5.16b, v3.16b
- st1 {v0.16b-v3.16b}, [x0], #64
- add x8, x8, #INTERLEAVE /* advance the scalar counter copy past the 4 blocks */
- #endif
- rev x7, x8
- ins v4.d[1], x7 /* write the advanced counter back into v4 */
- cbz w4, .Lctrout /* all blocks done */
- b .LctrloopNx
- .Lctr1x:
- adds w4, w4, #INTERLEAVE /* undo the last subtraction */
- beq .Lctrout /* nothing left over */
- #endif
- .Lctrloop:
- mov v0.16b, v4.16b
- encrypt_block v0, w3, x2, x6, w7 /* v0 = keystream block for the current counter */
- adds x8, x8, #1 /* increment BE ctr */
- rev x7, x8
- ins v4.d[1], x7
- bcs .Lctrcarry /* overflow? */
- .Lctrcarrydone:
- subs w4, w4, #1
- bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
- ld1 {v3.16b}, [x1], #16
- eor v3.16b, v0.16b, v3.16b /* keystream ^ input */
- st1 {v3.16b}, [x0], #16
- bne .Lctrloop /* tests the flags set by the subs above */
- .Lctrout:
- st1 {v4.16b}, [x5] /* return next CTR value */
- FRAME_POP
- ret
- .Lctrhalfblock:
- ld1 {v3.8b}, [x1] /* only 8 bytes are read ... */
- eor v3.8b, v0.8b, v3.8b
- st1 {v3.8b}, [x0] /* ... and only 8 bytes are written */
- FRAME_POP
- ret /* note: ctr[] is not updated on this path */
- .Lctrcarry:
- umov x7, v4.d[0] /* load upper word of ctr */
- rev x7, x7 /* ... to handle the carry */
- add x7, x7, #1
- rev x7, x7
- ins v4.d[0], x7
- b .Lctrcarrydone
- AES_ENDPROC(aes_ctr_encrypt)
- .ltorg /* emit the literal pool here (used by the ldr q8, =... above) */
- /*
- * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 const rk2[], u8 iv[], int first)
- * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 const rk2[], u8 iv[], int first)
- */
- .macro next_tweak, out, in, const, tmp /* out = in doubled as a 128-bit value, with \const folded in on carry; \tmp is clobbered; \out may alias \in */
- sshr \tmp\().2d, \in\().2d, #63 /* each 64-bit lane -> all ones if its top bit was set, else zero */
- and \tmp\().16b, \tmp\().16b, \const\().16b /* keep the per-lane constant only where a bit is shifted out */
- add \out\().2d, \in\().2d, \in\().2d /* shift each 64-bit lane left by one */
- ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 /* swap the two 64-bit halves so each carry lands in the other lane */
- eor \out\().16b, \out\().16b, \tmp\().16b /* apply the carries */
- .endm
- .Lxts_mul_x: /* 128-bit constant loaded into v7 and passed as \const to next_tweak */
- CPU_LE( .quad 1, 0x87 ) /* little-endian layout */
- CPU_BE( .quad 0x87, 1 ) /* big-endian layout: the two quads are swapped */
- AES_ENTRY(aes_xts_encrypt) /* x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2, x6 = iv, w7 = first */
- FRAME_PUSH
- cbz w7, .LxtsencloopNx /* first == 0: continue from the tweak already in v4 */
- ld1 {v4.16b}, [x6] /* load iv */
- enc_prepare w3, x5, x6 /* prepare with rk2 (x5) */
- encrypt_block v4, w3, x5, x6, w7 /* first tweak */
- enc_switch_key w3, x2, x6 /* switch to rk1 (x2) for the data blocks */
- ldr q7, .Lxts_mul_x /* v7 = constant for next_tweak */
- b .LxtsencNx
- .LxtsencloopNx:
- ldr q7, .Lxts_mul_x /* (re)load: the 4-way path below overwrites v7 */
- next_tweak v4, v4, v7, v8 /* advance past the last tweak used */
- .LxtsencNx:
- #if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE /* claim INTERLEAVE blocks */
- bmi .Lxtsenc1x /* fewer than INTERLEAVE left: finish one at a time */
- #if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
- next_tweak v5, v4, v7, v8 /* v5 = tweak for the second block */
- eor v0.16b, v0.16b, v4.16b /* xor tweaks in before encryption */
- eor v1.16b, v1.16b, v5.16b
- do_encrypt_block2x
- eor v0.16b, v0.16b, v4.16b /* ... and again afterwards */
- eor v1.16b, v1.16b, v5.16b
- st1 {v0.16b-v1.16b}, [x0], #32
- cbz w4, .LxtsencoutNx /* all blocks done */
- next_tweak v4, v5, v7, v8 /* v4 = tweak for the next block */
- b .LxtsencNx
- .LxtsencoutNx:
- mov v4.16b, v5.16b /* leave the last tweak used in v4 */
- b .Lxtsencout
- #else
- ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- next_tweak v5, v4, v7, v8 /* v5..v7 = tweaks for blocks 1..3 */
- eor v0.16b, v0.16b, v4.16b /* xor tweaks in before encryption */
- next_tweak v6, v5, v7, v8
- eor v1.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8 /* overwrites the constant in v7 with the 4th tweak */
- eor v3.16b, v3.16b, v7.16b
- do_encrypt_block4x
- eor v3.16b, v3.16b, v7.16b /* xor tweaks in again after encryption */
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v6.16b
- st1 {v0.16b-v3.16b}, [x0], #64
- mov v4.16b, v7.16b /* leave the last tweak used in v4 */
- cbz w4, .Lxtsencout /* all blocks done */
- b .LxtsencloopNx /* reload v7 and advance the tweak */
- #endif
- .Lxtsenc1x:
- adds w4, w4, #INTERLEAVE /* undo the last subtraction */
- beq .Lxtsencout /* nothing left over */
- #endif
- .Lxtsencloop:
- ld1 {v1.16b}, [x1], #16 /* get next pt block */
- eor v0.16b, v1.16b, v4.16b /* xor tweak in before encryption */
- encrypt_block v0, w3, x2, x6, w7
- eor v0.16b, v0.16b, v4.16b /* ... and again afterwards */
- st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- beq .Lxtsencout /* done: v4 still holds the last tweak used */
- next_tweak v4, v4, v7, v8
- b .Lxtsencloop
- .Lxtsencout:
- FRAME_POP
- ret /* the tweak is not written back to iv[] */
- AES_ENDPROC(aes_xts_encrypt)
- AES_ENTRY(aes_xts_decrypt) /* x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2, x6 = iv, w7 = first */
- FRAME_PUSH
- cbz w7, .LxtsdecloopNx /* first == 0: continue from the tweak already in v4 */
- ld1 {v4.16b}, [x6] /* load iv */
- enc_prepare w3, x5, x6 /* the tweak is produced by encryption with rk2, even when decrypting */
- encrypt_block v4, w3, x5, x6, w7 /* first tweak */
- dec_prepare w3, x2, x6 /* then set up decryption with rk1 (x2) */
- ldr q7, .Lxts_mul_x /* v7 = constant for next_tweak */
- b .LxtsdecNx
- .LxtsdecloopNx:
- ldr q7, .Lxts_mul_x /* (re)load: the 4-way path below overwrites v7 */
- next_tweak v4, v4, v7, v8 /* advance past the last tweak used */
- .LxtsdecNx:
- #if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE /* claim INTERLEAVE blocks */
- bmi .Lxtsdec1x /* fewer than INTERLEAVE left: finish one at a time */
- #if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- next_tweak v5, v4, v7, v8 /* v5 = tweak for the second block */
- eor v0.16b, v0.16b, v4.16b /* xor tweaks in before decryption */
- eor v1.16b, v1.16b, v5.16b
- do_decrypt_block2x
- eor v0.16b, v0.16b, v4.16b /* ... and again afterwards */
- eor v1.16b, v1.16b, v5.16b
- st1 {v0.16b-v1.16b}, [x0], #32
- cbz w4, .LxtsdecoutNx /* all blocks done */
- next_tweak v4, v5, v7, v8 /* v4 = tweak for the next block */
- b .LxtsdecNx
- .LxtsdecoutNx:
- mov v4.16b, v5.16b /* leave the last tweak used in v4 */
- b .Lxtsdecout
- #else
- ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- next_tweak v5, v4, v7, v8 /* v5..v7 = tweaks for blocks 1..3 */
- eor v0.16b, v0.16b, v4.16b /* xor tweaks in before decryption */
- next_tweak v6, v5, v7, v8
- eor v1.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8 /* overwrites the constant in v7 with the 4th tweak */
- eor v3.16b, v3.16b, v7.16b
- do_decrypt_block4x
- eor v3.16b, v3.16b, v7.16b /* xor tweaks in again after decryption */
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v6.16b
- st1 {v0.16b-v3.16b}, [x0], #64
- mov v4.16b, v7.16b /* leave the last tweak used in v4 */
- cbz w4, .Lxtsdecout /* all blocks done */
- b .LxtsdecloopNx /* reload v7 and advance the tweak */
- #endif
- .Lxtsdec1x:
- adds w4, w4, #INTERLEAVE /* undo the last subtraction */
- beq .Lxtsdecout /* nothing left over */
- #endif
- .Lxtsdecloop:
- ld1 {v1.16b}, [x1], #16 /* get next ct block */
- eor v0.16b, v1.16b, v4.16b /* xor tweak in before decryption */
- decrypt_block v0, w3, x2, x6, w7
- eor v0.16b, v0.16b, v4.16b /* ... and again afterwards */
- st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- beq .Lxtsdecout /* done: v4 still holds the last tweak used */
- next_tweak v4, v4, v7, v8
- b .Lxtsdecloop
- .Lxtsdecout:
- FRAME_POP
- ret /* the tweak is not written back to iv[] */
- AES_ENDPROC(aes_xts_decrypt)
|