memcpy_64.c

/*
 * Copyright 2011 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>
/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */

/* Must be 8 bytes in size. */
#define op_t uint64_t

/* Threshold value for when to enter the unrolled loops. */
#define OP_T_THRES 16

#if CHIP_L2_LINE_SIZE() != 64
#error "Assumes 64 byte line size"
#endif

/* How many cache lines ahead should we prefetch? */
#define PREFETCH_LINES_AHEAD 4

/*
 * Provide "base versions" of load and store for the normal code path.
 * The kernel provides other versions for userspace copies.
 */
#define ST(p, v) (*(p) = (v))
#define LD(p) (*(p))

#ifndef USERCOPY_FUNC
#define ST1 ST
#define ST2 ST
#define ST4 ST
#define ST8 ST
#define LD1 LD
#define LD2 LD
#define LD4 LD
#define LD8 LD
#define RETVAL dstv
void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
#else
/*
 * Special kernel version will provide implementation of the LDn/STn
 * macros to return a count of uncopied bytes due to mm fault.
 */
#define RETVAL 0
int __attribute__((optimize("omit-frame-pointer")))
USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
#endif
{
        char *__restrict dst1 = (char *)dstv;
        const char *__restrict src1 = (const char *)srcv;
        const char *__restrict src1_end;
        const char *__restrict prefetch;
        op_t *__restrict dst8;  /* 8-byte pointer to destination memory. */
        op_t final;             /* Final bytes to write to trailing word, if any */
        long i;

        if (n < 16) {
                for (; n; n--)
                        ST1(dst1++, LD1(src1++));
                return RETVAL;
        }

        /*
         * Locate the end of source memory we will copy.  Don't
         * prefetch past this.
         */
        src1_end = src1 + n - 1;

        /* Prefetch ahead a few cache lines, but not past the end. */
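        /*
         * If stepping ahead would move past the last source byte, the
         * prefetch cursor wraps back to src1; re-requesting a line that
         * has already been prefetched is harmless.
         */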
        prefetch = src1;
        for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
                __insn_prefetch(prefetch);
                prefetch += CHIP_L2_LINE_SIZE();
                prefetch = (prefetch < src1_end) ? prefetch : src1;
        }
        /* Copy bytes until dst is word-aligned. */
        for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
                ST1(dst1++, LD1(src1++));

        /* 8-byte pointer to destination memory. */
        dst8 = (op_t *)dst1;

        if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
                /* Unaligned copy. */

                op_t tmp0 = 0, tmp1 = 0, tmp2, tmp3;
                const op_t *src8 = (const op_t *)
                        ((uintptr_t)src1 & -sizeof(op_t));
                const void *srci = (void *)src1;
                int m;
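
                /*
                 * src8 points at the aligned doubleword containing the
                 * first source byte, and srci keeps the original unaligned
                 * source address: its low bits tell dblalign (the TILE-Gx
                 * "double align" funnel shift) how far to shift each pair
                 * of adjacent aligned doublewords to reconstruct the
                 * unaligned source data for an aligned 8-byte store.
                 */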
                m = (CHIP_L2_LINE_SIZE() << 2) -
                        (((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
                m = (n < m) ? n : m;
                m /= sizeof(op_t);

                /* Copy until 'dst' is cache-line-aligned. */
                n -= (sizeof(op_t) * m);
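
                /*
                 * m is now the number of whole doublewords needed to bring
                 * dst8 up to the next 4-cache-line (256-byte) boundary,
                 * capped at the bytes remaining.  The switch below primes
                 * the pipeline registers and jumps into the matching phase
                 * of the 4-way unrolled loop, so a total of m doublewords
                 * get stored (the last one at _8B0); when m is zero it
                 * skips straight to _M0.
                 */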
                switch (m % 4) {
                case 0:
                        if (__builtin_expect(!m, 0))
                                goto _M0;
                        tmp1 = LD8(src8++);
                        tmp2 = LD8(src8++);
                        goto _8B3;
                case 2:
                        m += 2;
                        tmp3 = LD8(src8++);
                        tmp0 = LD8(src8++);
                        goto _8B1;
                case 3:
                        m += 1;
                        tmp2 = LD8(src8++);
                        tmp3 = LD8(src8++);
                        goto _8B2;
                case 1:
                        m--;
                        tmp0 = LD8(src8++);
                        tmp1 = LD8(src8++);
                        if (__builtin_expect(!m, 0))
                                goto _8B0;
                }

                do {
                        tmp2 = LD8(src8++);
                        tmp0 = __insn_dblalign(tmp0, tmp1, srci);
                        ST8(dst8++, tmp0);
_8B3:
                        tmp3 = LD8(src8++);
                        tmp1 = __insn_dblalign(tmp1, tmp2, srci);
                        ST8(dst8++, tmp1);
_8B2:
                        tmp0 = LD8(src8++);
                        tmp2 = __insn_dblalign(tmp2, tmp3, srci);
                        ST8(dst8++, tmp2);
_8B1:
                        tmp1 = LD8(src8++);
                        tmp3 = __insn_dblalign(tmp3, tmp0, srci);
                        ST8(dst8++, tmp3);
                        m -= 4;
                } while (m);
_8B0:
                tmp0 = __insn_dblalign(tmp0, tmp1, srci);
                ST8(dst8++, tmp0);
                src8--;

_M0:
                if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
                        op_t tmp4, tmp5, tmp6, tmp7, tmp8;

                        prefetch = ((const char *)src8) +
                                CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;

                        for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
                             n -= CHIP_L2_LINE_SIZE()) {
                                /* Prefetch and advance to next line to
                                   prefetch, but don't go past the end. */
                                __insn_prefetch(prefetch);

                                /* Make sure prefetch got scheduled
                                   earlier. */
                                __asm__ ("" : : : "memory");

                                prefetch += CHIP_L2_LINE_SIZE();
                                prefetch = (prefetch < src1_end) ? prefetch :
                                        (const char *) src8;

                                tmp1 = LD8(src8++);
                                tmp2 = LD8(src8++);
                                tmp3 = LD8(src8++);
                                tmp4 = LD8(src8++);
                                tmp5 = LD8(src8++);
                                tmp6 = LD8(src8++);
                                tmp7 = LD8(src8++);
                                tmp8 = LD8(src8++);

                                tmp0 = __insn_dblalign(tmp0, tmp1, srci);
                                tmp1 = __insn_dblalign(tmp1, tmp2, srci);
                                tmp2 = __insn_dblalign(tmp2, tmp3, srci);
                                tmp3 = __insn_dblalign(tmp3, tmp4, srci);
                                tmp4 = __insn_dblalign(tmp4, tmp5, srci);
                                tmp5 = __insn_dblalign(tmp5, tmp6, srci);
                                tmp6 = __insn_dblalign(tmp6, tmp7, srci);
                                tmp7 = __insn_dblalign(tmp7, tmp8, srci);
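
                                /*
                                 * wh64 is a write hint: every byte of the
                                 * 64-byte destination line is about to be
                                 * overwritten, so the cache may allocate
                                 * the line without first fetching its old
                                 * contents from memory.
                                 */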
                                __insn_wh64(dst8);

                                ST8(dst8++, tmp0);
                                ST8(dst8++, tmp1);
                                ST8(dst8++, tmp2);
                                ST8(dst8++, tmp3);
                                ST8(dst8++, tmp4);
                                ST8(dst8++, tmp5);
                                ST8(dst8++, tmp6);
                                ST8(dst8++, tmp7);

                                tmp0 = tmp8;
                        }
                        src8--;
                }

                /* Copy any remaining 8-byte chunks. */
                if (n >= sizeof(op_t)) {
                        tmp0 = LD8(src8++);
                        for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
                                tmp1 = LD8(src8++);
                                tmp0 = __insn_dblalign(tmp0, tmp1, srci);
                                ST8(dst8++, tmp0);
                                tmp0 = tmp1;
                        }
                        src8--;
                }

                if (n == 0)
                        return RETVAL;
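
                /*
                 * Fewer than 8 bytes remain.  Build the final doubleword
                 * from the current aligned word and the next one, but only
                 * load that next word if it still lies within the source
                 * buffer; otherwise substitute zero so we never read past
                 * the end.
                 */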
                tmp0 = LD8(src8++);
                tmp1 = ((const char *)src8 <= src1_end)
                        ? LD8((op_t *)src8) : 0;
                final = __insn_dblalign(tmp0, tmp1, srci);
        } else {
                /* Aligned copy. */

                const op_t *__restrict src8 = (const op_t *)src1;

                /* src8 and dst8 are both word-aligned. */
                if (n >= CHIP_L2_LINE_SIZE()) {
                        /* Copy until 'dst' is cache-line-aligned. */
                        for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
                             n -= sizeof(op_t))
                                ST8(dst8++, LD8(src8++));

                        for (; n >= CHIP_L2_LINE_SIZE(); ) {
                                op_t tmp0, tmp1, tmp2, tmp3;
                                op_t tmp4, tmp5, tmp6, tmp7;

                                /*
                                 * Prefetch and advance to next line
                                 * to prefetch, but don't go past the
                                 * end.
                                 */
                                __insn_prefetch(prefetch);

                                /* Make sure prefetch got scheduled
                                   earlier. */
                                __asm__ ("" : : : "memory");

                                prefetch += CHIP_L2_LINE_SIZE();
                                prefetch = (prefetch < src1_end) ? prefetch :
                                        (const char *)src8;

                                /*
                                 * Do all the loads before wh64.  This
                                 * is necessary if [src8, src8+7] and
                                 * [dst8, dst8+7] share the same cache
                                 * line and dst8 <= src8, as can be
                                 * the case when called from memmove,
                                 * or with code tested on x86 whose
                                 * memcpy always works with forward
                                 * copies.
                                 */
                                tmp0 = LD8(src8++);
                                tmp1 = LD8(src8++);
                                tmp2 = LD8(src8++);
                                tmp3 = LD8(src8++);
                                tmp4 = LD8(src8++);
                                tmp5 = LD8(src8++);
                                tmp6 = LD8(src8++);
                                tmp7 = LD8(src8++);
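
                                /*
                                 * The dummy "move %0, %0" makes the asm
                                 * statement depend on tmp7, so the write
                                 * hint cannot issue until all eight loads
                                 * above have completed (see the comment
                                 * before the loads).
                                 */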
                                /* wh64 and wait for tmp7 load completion. */
                                __asm__ ("move %0, %0; wh64 %1\n"
                                         : : "r"(tmp7), "r"(dst8));

                                ST8(dst8++, tmp0);
                                ST8(dst8++, tmp1);
                                ST8(dst8++, tmp2);
                                ST8(dst8++, tmp3);
                                ST8(dst8++, tmp4);
                                ST8(dst8++, tmp5);
                                ST8(dst8++, tmp6);
                                ST8(dst8++, tmp7);

                                n -= CHIP_L2_LINE_SIZE();
                        }
#if CHIP_L2_LINE_SIZE() != 64
# error "Fix code that assumes particular L2 cache line size."
#endif
                }

                for (; n >= sizeof(op_t); n -= sizeof(op_t))
                        ST8(dst8++, LD8(src8++));

                if (__builtin_expect(n == 0, 1))
                        return RETVAL;

                final = LD8(src8);
        }

        /* n != 0 if we get here.  Write out any trailing bytes. */
        dst1 = (char *)dst8;
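        /*
         * 'final' holds the realigned last doubleword of the copy.  On a
         * little-endian build its low-order bytes belong first in memory,
         * so the code below stores 4, 2, then 1 byte from the low end,
         * shifting the remainder down after each store; the big-endian
         * variant peels bytes from the high end instead.
         */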
#ifndef __BIG_ENDIAN__
        if (n & 4) {
                ST4((uint32_t *)dst1, final);
                dst1 += 4;
                final >>= 32;
                n &= 3;
        }
        if (n & 2) {
                ST2((uint16_t *)dst1, final);
                dst1 += 2;
                final >>= 16;
                n &= 1;
        }
        if (n)
                ST1((uint8_t *)dst1, final);
#else
        if (n & 4) {
                ST4((uint32_t *)dst1, final >> 32);
                dst1 += 4;
        } else {
                final >>= 32;
        }
        if (n & 2) {
                ST2((uint16_t *)dst1, final >> 16);
                dst1 += 2;
        } else {
                final >>= 16;
        }
        if (n & 1)
                ST1((uint8_t *)dst1, final >> 8);
#endif

        return RETVAL;
}

#ifdef USERCOPY_FUNC
#undef ST1
#undef ST2
#undef ST4
#undef ST8
#undef LD1
#undef LD2
#undef LD4
#undef LD8
#undef USERCOPY_FUNC
#endif
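
/*
 * Usage note: when USERCOPY_FUNC is defined, this file is meant to be
 * #included by a wrapper that first defines fault-aware LD1/ST1 ... LD8/ST8
 * macros, so the same body can back both memcpy and the copy-to/from-user
 * routines.  Which wrapper does so (e.g. a user-copy helper elsewhere in
 * arch/tile/lib) is an assumption here, not something this file states.
 */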