aes-spe-core.S 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. /*
  2. * Fast AES implementation for SPE instruction set (PPC)
  3. *
  4. * This code makes use of the SPE SIMD instruction set as defined in
  5. * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
  6. * Implementation is based on optimization guide notes from
  7. * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
  8. *
  9. * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
  10. *
  11. * This program is free software; you can redistribute it and/or modify it
  12. * under the terms of the GNU General Public License as published by the Free
  13. * Software Foundation; either version 2 of the License, or (at your option)
  14. * any later version.
  15. *
  16. */
  17. #include <asm/ppc_asm.h>
  18. #include "aes-spe-regs.h"
  /*
   * Table lookup helper macros.
   *
   * rT0 and rT1 serve as base pointers for the loads below. The address
   * macros overwrite a bit field of the pointer with one byte taken from a
   * data register, so that the following load fetches the entry selected by
   * that byte. bpos picks the byte of the 32 bit source word: 0 is the least
   * significant byte, 3 the most significant one.
   * NOTE(review): because the index bits are inserted rather than added, the
   * tables are presumably aligned so that those bits of the base are free -
   * confirm against the table definitions and the callers.
   */
  /*
   * EAD: rotate `in` so that the byte selected by bpos lands in bits 20-27
   * of rT0 (IBM bit numbering), i.e. rT0 gets base + byte * 16.
   */
  19. #define EAD(in, bpos) \
  20. rlwimi rT0,in,28-((bpos+3)%4)*8,20,27;
  /*
   * DAD: same idea for rT1, but the byte goes unscaled into bits 24-31,
   * i.e. rT1 gets base + byte.
   */
  21. #define DAD(in, bpos) \
  22. rlwimi rT1,in,24-((bpos+3)%4)*8,24,31;
  /*
   * LWH: evlwwsplat writes the loaded word into both halves of the 64 bit
   * register. The code pairs it with a later LWL on the same register, which
   * replaces the lower half only and leaves this word in the upper half.
   */
  23. #define LWH(out, off) \
  24. evlwwsplat out,off(rT0); /* load word high */
  /* LWL: plain 32 bit load from the entry addressed by rT0. */
  25. #define LWL(out, off) \
  26. lwz out,off(rT0); /* load word low */
  /* LBZ: single byte load from off(tab); tab is rT0 or rT1 in this file. */
  27. #define LBZ(out, tab, off) \
  28. lbz out,off(tab); /* load byte */
  /* LAH/LAL: select the table entry with EAD, then load one of its words. */
  29. #define LAH(out, in, bpos, off) \
  30. EAD(in, bpos) /* calc addr + load word high */ \
  31. LWH(out, off)
  32. #define LAL(out, in, bpos, off) \
  33. EAD(in, bpos) /* calc addr + load word low */ \
  34. LWL(out, off)
  /*
   * LAE/LBE: byte load at offset 8 of the entry addressed by rT0. LAE
   * selects the entry first, LBE reuses the address already held in rT0.
   */
  35. #define LAE(out, in, bpos) \
  36. EAD(in, bpos) /* calc addr + load enc byte */ \
  37. LBZ(out, rT0, 8)
  38. #define LBE(out) \
  39. LBZ(out, rT0, 8) /* load enc byte */
  /*
   * LAD/LBD: byte load at offset 0 from rT1. LAD computes the address with
   * DAD first, LBD reuses the address already held in rT1.
   */
  40. #define LAD(out, in, bpos) \
  41. DAD(in, bpos) /* calc addr + load dec byte */ \
  42. LBZ(out, rT1, 0)
  43. #define LBD(out) \
  44. LBZ(out, rT1, 0)
  45. /*
  46. * ppc_encrypt_block: The central encryption function for a single 16 bytes
  47. * block. It does no stack handling or register saving to support fast calls
  48. * via bl/blr. It expects that caller has pre-xored input data with first
  49. * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
  50. * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
  51. * and rW0-rW3 and caller must execute a final xor on the output registers.
  52. * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
  53. *
  54. */
  55. _GLOBAL(ppc_encrypt_block)
  /*
   * Prologue: issue the three high word table loads (rW4, rW6, rW3) that
   * every later sequence performs at its tail, so the loop body can be
   * entered in the same state that a previous pass would have left behind.
   */
  56. LAH(rW4, rD1, 2, 4)
  57. LAH(rW6, rD0, 3, 0)
  58. LAH(rW3, rD0, 1, 8)
  59. ppc_encrypt_block_loop:
  /*
   * One loop pass contains two copies of the same sequence:
   *  - table words selected by bytes of rD0-rD3 are gathered in rW0-rW7,
   *  - key words are loaded from rKP into rD1/rD3 and xored with them,
   *  - evmergehi copies the upper word of rD1/rD3 into the lower word of
   *    rD0/rD2,
   *  - the first table loads for the following sequence are interleaved
   *    with the xor chain.
   * First copy: key words at 16(rKP) and 24(rKP).
   */
  60. LAH(rW0, rD3, 0, 12)
  61. LAL(rW0, rD0, 0, 12)
  62. LAH(rW1, rD1, 0, 12)
  63. LAH(rW2, rD2, 1, 8)
  64. LAL(rW2, rD3, 1, 8)
  65. LAL(rW3, rD1, 1, 8)
  66. LAL(rW4, rD2, 2, 4)
  67. LAL(rW6, rD1, 3, 0)
  68. LAH(rW5, rD3, 2, 4)
  69. LAL(rW5, rD0, 2, 4)
  70. LAH(rW7, rD2, 3, 0)
  71. evldw rD1,16(rKP) /* two key words into upper/lower half of rD1 */
  72. EAD(rD3, 3)
  73. evxor rW2,rW2,rW4
  74. LWL(rW7, 0)
  75. evxor rW2,rW2,rW6
  76. EAD(rD2, 0)
  77. evxor rD1,rD1,rW2
  78. LWL(rW1, 12)
  79. evxor rD1,rD1,rW0
  80. evldw rD3,24(rKP) /* next two key words */
  81. evmergehi rD0,rD0,rD1 /* lower word of rD0 = upper word of rD1 */
  82. EAD(rD1, 2) /* from here on: early table loads for the next sequence */
  83. evxor rW3,rW3,rW5
  84. LWH(rW4, 4)
  85. evxor rW3,rW3,rW7
  86. EAD(rD0, 3)
  87. evxor rD3,rD3,rW3
  88. LWH(rW6, 0)
  89. evxor rD3,rD3,rW1
  90. EAD(rD0, 1)
  91. evmergehi rD2,rD2,rD3 /* lower word of rD2 = upper word of rD3 */
  92. LWH(rW3, 8)
  /* Second copy: identical sequence, key words at 32(rKP) and 40(rKP). */
  93. LAH(rW0, rD3, 0, 12)
  94. LAL(rW0, rD0, 0, 12)
  95. LAH(rW1, rD1, 0, 12)
  96. LAH(rW2, rD2, 1, 8)
  97. LAL(rW2, rD3, 1, 8)
  98. LAL(rW3, rD1, 1, 8)
  99. LAL(rW4, rD2, 2, 4)
  100. LAL(rW6, rD1, 3, 0)
  101. LAH(rW5, rD3, 2, 4)
  102. LAL(rW5, rD0, 2, 4)
  103. LAH(rW7, rD2, 3, 0)
  104. evldw rD1,32(rKP)
  105. EAD(rD3, 3)
  106. evxor rW2,rW2,rW4
  107. LWL(rW7, 0)
  108. evxor rW2,rW2,rW6
  109. EAD(rD2, 0)
  110. evxor rD1,rD1,rW2
  111. LWL(rW1, 12)
  112. evxor rD1,rD1,rW0
  113. evldw rD3,40(rKP)
  114. evmergehi rD0,rD0,rD1
  115. EAD(rD1, 2)
  116. evxor rW3,rW3,rW5
  117. LWH(rW4, 4)
  118. evxor rW3,rW3,rW7
  119. EAD(rD0, 3)
  120. evxor rD3,rD3,rW3
  121. LWH(rW6, 0)
  122. evxor rD3,rD3,rW1
  123. EAD(rD0, 1)
  124. evmergehi rD2,rD2,rD3
  125. LWH(rW3, 8)
  126. addi rKP,rKP,32 /* both copies together consumed 32 bytes of key */
  127. bdnz ppc_encrypt_block_loop /* decrement CTR, repeat while non-zero */
  /*
   * After the loop: one more word based sequence with key words at 16(rKP)
   * and 24(rKP). Its tail no longer preloads table words; it starts the
   * single byte loads (offset 8 of the table entry) of the last stage.
   */
  128. LAH(rW0, rD3, 0, 12)
  129. LAL(rW0, rD0, 0, 12)
  130. LAH(rW1, rD1, 0, 12)
  131. LAH(rW2, rD2, 1, 8)
  132. LAL(rW2, rD3, 1, 8)
  133. LAL(rW3, rD1, 1, 8)
  134. LAL(rW4, rD2, 2, 4)
  135. LAH(rW5, rD3, 2, 4)
  136. LAL(rW6, rD1, 3, 0)
  137. LAL(rW5, rD0, 2, 4)
  138. LAH(rW7, rD2, 3, 0)
  139. evldw rD1,16(rKP)
  140. EAD(rD3, 3)
  141. evxor rW2,rW2,rW4
  142. LWL(rW7, 0)
  143. evxor rW2,rW2,rW6
  144. EAD(rD2, 0)
  145. evxor rD1,rD1,rW2
  146. LWL(rW1, 12)
  147. evxor rD1,rD1,rW0
  148. evldw rD3,24(rKP)
  149. evmergehi rD0,rD0,rD1
  150. EAD(rD1, 0)
  151. evxor rW3,rW3,rW5
  152. LBE(rW2)
  153. evxor rW3,rW3,rW7
  154. EAD(rD0, 1)
  155. evxor rD3,rD3,rW3
  156. LBE(rW6)
  157. evxor rD3,rD3,rW1
  158. EAD(rD0, 0)
  159. evmergehi rD2,rD2,rD3
  160. LBE(rW1)
  /*
   * Last stage: fetch one byte per lookup and pack four of them into each
   * of rW0-rW3. The first byte of a word is loaded straight into the target
   * register; the rlwimi instructions insert the other three at bits 16-23,
   * 8-15 and 0-7.
   */
  161. LAE(rW0, rD3, 0)
  162. LAE(rW1, rD0, 0)
  163. LAE(rW4, rD2, 1)
  164. LAE(rW5, rD3, 1)
  165. LAE(rW3, rD2, 0)
  166. LAE(rW7, rD1, 1)
  167. rlwimi rW0,rW4,8,16,23
  168. rlwimi rW1,rW5,8,16,23
  169. LAE(rW4, rD1, 2)
  170. LAE(rW5, rD2, 2)
  171. rlwimi rW2,rW6,8,16,23
  172. rlwimi rW3,rW7,8,16,23
  173. LAE(rW6, rD3, 2)
  174. LAE(rW7, rD0, 2)
  175. rlwimi rW0,rW4,16,8,15
  176. rlwimi rW1,rW5,16,8,15
  177. LAE(rW4, rD0, 3)
  178. LAE(rW5, rD1, 3)
  179. rlwimi rW2,rW6,16,8,15
  180. lwz rD0,32(rKP) /* rD0-rD3 = key words at 32..44(rKP), left for the caller's final xor */
  181. rlwimi rW3,rW7,16,8,15
  182. lwz rD1,36(rKP)
  183. LAE(rW6, rD2, 3)
  184. LAE(rW7, rD3, 3)
  185. rlwimi rW0,rW4,24,0,7
  186. lwz rD2,40(rKP)
  187. rlwimi rW1,rW5,24,0,7
  188. lwz rD3,44(rKP)
  189. rlwimi rW2,rW6,24,0,7
  190. rlwimi rW3,rW7,24,0,7
  191. blr /* no stack frame or saved registers: plain return */
  192. /*
  193. * ppc_decrypt_block: The central decryption function for a single 16 bytes
  194. * block. It does no stack handling or register saving to support fast calls
  195. * via bl/blr. It expects that caller has pre-xored input data with first
  196. * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
  197. * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
  198. * and rW0-rW3 and caller must execute a final xor on the output registers.
  199. * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
  200. *
  201. */
  202. _GLOBAL(ppc_decrypt_block)
  /*
   * Prologue: issue the three high word table loads (rW0, rW6, rW3) that
   * every later sequence performs at its tail, so the loop body can be
   * entered in the same state that a previous pass would have left behind.
   */
  203. LAH(rW0, rD1, 0, 12)
  204. LAH(rW6, rD0, 3, 0)
  205. LAH(rW3, rD0, 1, 8)
  206. ppc_decrypt_block_loop:
  /*
   * One loop pass contains two copies of the same sequence:
   *  - table words selected by bytes of rD0-rD3 are gathered in rW0-rW7,
   *  - key words are loaded from rKP into rD1/rD3 and xored with them,
   *  - evmergehi copies the upper word of rD1/rD3 into the lower word of
   *    rD0/rD2,
   *  - the first table loads for the following sequence are interleaved
   *    with the xor chain.
   * First copy: key words at 16(rKP) and 24(rKP).
   */
  207. LAH(rW1, rD3, 0, 12)
  208. LAL(rW0, rD2, 0, 12)
  209. LAH(rW2, rD2, 1, 8)
  210. LAL(rW2, rD3, 1, 8)
  211. LAH(rW4, rD3, 2, 4)
  212. LAL(rW4, rD0, 2, 4)
  213. LAL(rW6, rD1, 3, 0)
  214. LAH(rW5, rD1, 2, 4)
  215. LAH(rW7, rD2, 3, 0)
  216. LAL(rW7, rD3, 3, 0)
  217. LAL(rW3, rD1, 1, 8)
  218. evldw rD1,16(rKP) /* two key words into upper/lower half of rD1 */
  219. EAD(rD0, 0)
  220. evxor rW4,rW4,rW6
  221. LWL(rW1, 12)
  222. evxor rW0,rW0,rW4
  223. EAD(rD2, 2)
  224. evxor rW0,rW0,rW2
  225. LWL(rW5, 4)
  226. evxor rD1,rD1,rW0
  227. evldw rD3,24(rKP) /* next two key words */
  228. evmergehi rD0,rD0,rD1 /* lower word of rD0 = upper word of rD1 */
  229. EAD(rD1, 0) /* from here on: early table loads for the next sequence */
  230. evxor rW3,rW3,rW7
  231. LWH(rW0, 12)
  232. evxor rW3,rW3,rW1
  233. EAD(rD0, 3)
  234. evxor rD3,rD3,rW3
  235. LWH(rW6, 0)
  236. evxor rD3,rD3,rW5
  237. EAD(rD0, 1)
  238. evmergehi rD2,rD2,rD3 /* lower word of rD2 = upper word of rD3 */
  239. LWH(rW3, 8)
  /* Second copy: identical sequence, key words at 32(rKP) and 40(rKP). */
  240. LAH(rW1, rD3, 0, 12)
  241. LAL(rW0, rD2, 0, 12)
  242. LAH(rW2, rD2, 1, 8)
  243. LAL(rW2, rD3, 1, 8)
  244. LAH(rW4, rD3, 2, 4)
  245. LAL(rW4, rD0, 2, 4)
  246. LAL(rW6, rD1, 3, 0)
  247. LAH(rW5, rD1, 2, 4)
  248. LAH(rW7, rD2, 3, 0)
  249. LAL(rW7, rD3, 3, 0)
  250. LAL(rW3, rD1, 1, 8)
  251. evldw rD1,32(rKP)
  252. EAD(rD0, 0)
  253. evxor rW4,rW4,rW6
  254. LWL(rW1, 12)
  255. evxor rW0,rW0,rW4
  256. EAD(rD2, 2)
  257. evxor rW0,rW0,rW2
  258. LWL(rW5, 4)
  259. evxor rD1,rD1,rW0
  260. evldw rD3,40(rKP)
  261. evmergehi rD0,rD0,rD1
  262. EAD(rD1, 0)
  263. evxor rW3,rW3,rW7
  264. LWH(rW0, 12)
  265. evxor rW3,rW3,rW1
  266. EAD(rD0, 3)
  267. evxor rD3,rD3,rW3
  268. LWH(rW6, 0)
  269. evxor rD3,rD3,rW5
  270. EAD(rD0, 1)
  271. evmergehi rD2,rD2,rD3
  272. LWH(rW3, 8)
  273. addi rKP,rKP,32 /* both copies together consumed 32 bytes of key */
  274. bdnz ppc_decrypt_block_loop /* decrement CTR, repeat while non-zero */
  /*
   * After the loop: one more word based sequence with key words at 16(rKP)
   * and 24(rKP). Its tail no longer preloads table words; it starts the
   * single byte loads of the last stage, which index through rT1 instead
   * of rT0.
   * NOTE(review): rT1 is only modified in its low 8 bits here, so its upper
   * bits must already hold a valid base address on entry. The header comment
   * of this function lists only rT0, rKP and CTR as caller set up - confirm
   * that callers initialise rT1 as well.
   */
  275. LAH(rW1, rD3, 0, 12)
  276. LAL(rW0, rD2, 0, 12)
  277. LAH(rW2, rD2, 1, 8)
  278. LAL(rW2, rD3, 1, 8)
  279. LAH(rW4, rD3, 2, 4)
  280. LAL(rW4, rD0, 2, 4)
  281. LAL(rW6, rD1, 3, 0)
  282. LAH(rW5, rD1, 2, 4)
  283. LAH(rW7, rD2, 3, 0)
  284. LAL(rW7, rD3, 3, 0)
  285. LAL(rW3, rD1, 1, 8)
  286. evldw rD1,16(rKP)
  287. EAD(rD0, 0)
  288. evxor rW4,rW4,rW6
  289. LWL(rW1, 12)
  290. evxor rW0,rW0,rW4
  291. EAD(rD2, 2)
  292. evxor rW0,rW0,rW2
  293. LWL(rW5, 4)
  294. evxor rD1,rD1,rW0
  295. evldw rD3,24(rKP)
  296. evmergehi rD0,rD0,rD1
  297. DAD(rD1, 0)
  298. evxor rW3,rW3,rW7
  299. LBD(rW0)
  300. evxor rW3,rW3,rW1
  301. DAD(rD0, 1)
  302. evxor rD3,rD3,rW3
  303. LBD(rW6)
  304. evxor rD3,rD3,rW5
  305. DAD(rD0, 0)
  306. evmergehi rD2,rD2,rD3
  307. LBD(rW3)
  /*
   * Last stage: fetch one byte per lookup and pack four of them into each
   * of rW0-rW3. The first byte of a word is loaded straight into the target
   * register; the rlwimi instructions insert the other three at bits 16-23,
   * 8-15 and 0-7.
   */
  308. LAD(rW2, rD3, 0)
  309. LAD(rW1, rD2, 0)
  310. LAD(rW4, rD2, 1)
  311. LAD(rW5, rD3, 1)
  312. LAD(rW7, rD1, 1)
  313. rlwimi rW0,rW4,8,16,23
  314. rlwimi rW1,rW5,8,16,23
  315. LAD(rW4, rD3, 2)
  316. LAD(rW5, rD0, 2)
  317. rlwimi rW2,rW6,8,16,23
  318. rlwimi rW3,rW7,8,16,23
  319. LAD(rW6, rD1, 2)
  320. LAD(rW7, rD2, 2)
  321. rlwimi rW0,rW4,16,8,15
  322. rlwimi rW1,rW5,16,8,15
  323. LAD(rW4, rD0, 3)
  324. LAD(rW5, rD1, 3)
  325. rlwimi rW2,rW6,16,8,15
  326. lwz rD0,32(rKP) /* rD0-rD3 = key words at 32..44(rKP), left for the caller's final xor */
  327. rlwimi rW3,rW7,16,8,15
  328. lwz rD1,36(rKP)
  329. LAD(rW6, rD2, 3)
  330. LAD(rW7, rD3, 3)
  331. rlwimi rW0,rW4,24,0,7
  332. lwz rD2,40(rKP)
  333. rlwimi rW1,rW5,24,0,7
  334. lwz rD3,44(rKP)
  335. rlwimi rW2,rW6,24,0,7
  336. rlwimi rW3,rW7,24,0,7
  337. blr /* no stack frame or saved registers: plain return */