mmx_32.c
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQ's. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code neutral filler for the short jump
 *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>
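
/*
 * MMX-assisted memcpy: moves 64 bytes per loop iteration through
 * %mm0-%mm3.  In interrupt context the FPU/MMX state cannot be saved,
 * so we fall back to the plain __memcpy().  The sub-64-byte tail is
 * also finished with __memcpy().  Faulting prefetch instructions are
 * handled by the exception fixups below, which patch the offending
 * prefetch into a short jmp so the copy continues without it.
 */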
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();
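
	/*
	 * Warm-up: prefetch five source cache lines (offsets 0..256).
	 * If a prefetch faults, the fixup at label 3 rewrites label 1
	 * with the bytes 0xEB 0x1A, i.e. a short "jmp" over the
	 * remaining 26 bytes of the 28-byte prefetch block, and
	 * execution resumes at label 2.
	 */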
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"	/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b)
		: : "r" (from));
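
	/*
	 * Main loop: copy 64 bytes per iteration while prefetching 320
	 * bytes ahead.  We stop five chunks early so the prefetches stay
	 * within the source buffer.
	 */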
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
			"1: prefetch 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			".section .fixup, \"ax\"\n"
			"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
			"   jmp 2b\n"
			".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
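
	/*
	 * Remaining full 64-byte chunks (at most five), copied without
	 * prefetch.
	 */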
	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
			"   movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 * other MMX using processors do not.
 */
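
/*
 * Clear a 4096-byte page with non-temporal movntq stores, 64 bytes per
 * iteration, bypassing the cache.  Because movntq is weakly ordered,
 * the sfence at the end restores ordering.
 */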
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			"   movntq %%mm0, (%0)\n"
			"   movntq %%mm0, 8(%0)\n"
			"   movntq %%mm0, 16(%0)\n"
			"   movntq %%mm0, 24(%0)\n"
			"   movntq %%mm0, 32(%0)\n"
			"   movntq %%mm0, 40(%0)\n"
			"   movntq %%mm0, 48(%0)\n"
			"   movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");

		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}
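
/*
 * Copy a 4096-byte page: ordinary movq loads from the source, movntq
 * non-temporal stores to the destination.  Prefetch runs 320 bytes
 * ahead of the loads; the final 320 bytes of the page are copied in a
 * second loop without prefetch so we never prefetch past the page.
 */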
static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
			"1: prefetch 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movntq %%mm0, (%1)\n"
			"   movq 8(%0), %%mm1\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movq 16(%0), %%mm2\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movq 24(%0), %%mm3\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm4\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movq 40(%0), %%mm5\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movq 48(%0), %%mm6\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm7, 56(%1)\n"
			".section .fixup, \"ax\"\n"
			"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
			"   jmp 2b\n"
			".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
			"2: movq (%0), %%mm0\n"
			"   movntq %%mm0, (%1)\n"
			"   movq 8(%0), %%mm1\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movq 16(%0), %%mm2\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movq 24(%0), %%mm3\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm4\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movq 40(%0), %%mm5\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movq 48(%0), %%mm6\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7 specific streaming
 */
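
/*
 * Clear a 4096-byte page with ordinary (cached) movq stores, 128 bytes
 * per iteration.  No sfence is needed here since these are normal,
 * program-ordered stores rather than movntq.
 */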
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
			"   movq %%mm0, (%0)\n"
			"   movq %%mm0, 8(%0)\n"
			"   movq %%mm0, 16(%0)\n"
			"   movq %%mm0, 24(%0)\n"
			"   movq %%mm0, 32(%0)\n"
			"   movq %%mm0, 40(%0)\n"
			"   movq %%mm0, 48(%0)\n"
			"   movq %%mm0, 56(%0)\n"
			"   movq %%mm0, 64(%0)\n"
			"   movq %%mm0, 72(%0)\n"
			"   movq %%mm0, 80(%0)\n"
			"   movq %%mm0, 88(%0)\n"
			"   movq %%mm0, 96(%0)\n"
			"   movq %%mm0, 104(%0)\n"
			"   movq %%mm0, 112(%0)\n"
			"   movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");

		page += 128;
	}

	kernel_fpu_end();
}
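
/*
 * Copy a 4096-byte page through the cache: the same 64-byte movq loop
 * as _mmx_memcpy(), with prefetch running 320 bytes ahead.
 */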
static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			"1: prefetch 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			".section .fixup, \"ax\"\n"
			"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
			"   jmp 2b\n"
			".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
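
/*
 * Fallback for interrupt context: "rep stosl" writes 1024 zero
 * longwords, i.e. one 4096-byte page.
 */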
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);
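
/*
 * Fallback for interrupt context: "rep movsl" copies 1024 longwords,
 * i.e. one 4096-byte page.
 */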
static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);