copy_page.S 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. /*
  2. Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
  3. This file is subject to the terms and conditions of the GNU General Public
  4. License. See the file "COPYING" in the main directory of this archive
  5. for more details.
  6. Tight version of memcpy for the case of just copying a page.
  7. Prefetch strategy empirically optimised against RTL simulations
  8. of SH5-101 cut2 eval chip with Cayman board DDR memory.
  9. Parameters:
  10. r2 : destination effective address (start of page)
  11. r3 : source effective address (start of page)
  12. Always copies 4096 bytes.
  13. Points to review.
  14. * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
  15. It seems like the prefetch needs to be at least 4 lines ahead to get
  16. the data into the cache in time, and the allocos contend with outstanding
  17. prefetches for the same cache set, so it's better to have the numbers
  18. different.
  19. */
  20. .section .text..SHmedia32,"ax"
  21. .little
  22. .balign 8
  23. .global copy_page
      /*
       * copy_page: copy one 4096-byte page from the address in r3 to the
       * address in r2.
       *
       * In:     r2 = destination address, r3 = source address.
       *         r18 is read by ptabs to form the return target (assumed to
       *         hold the caller's return address -- confirm against the ABI).
       * Out:    no result register is set. r2 is left at destination + 4096;
       *         r3 is not modified.
       * Writes: r2, r6-r8, r22, r23, r36-r39, r60-r62, tr0-tr3.
       *
       * A "line" below is 32 bytes: each loop pass moves four quadwords,
       * advances r2 by 32, and the alloco at offset 0x40 is described as
       * being two lines ahead.
       */
  24. copy_page:
  25. /* Copy 4096 bytes worth of data from r3 to r2.
  26. Do prefetches 4 lines ahead.
  27. Do alloco 2 lines ahead */
      /* Note: both prefetch sequences below sit inside "#if 0" blocks, so only
         the alloco part of that strategy is assembled. */
  28. pta 1f, tr1 /* tr1 -> label 1: top of the copy loop */
  29. pta 2f, tr2 /* tr2 -> label 2: referenced only by the disabled prefetch branch */
  30. pta 3f, tr3 /* tr3 -> label 3: copy body, bypassing the alloco */
  31. ptabs r18, tr0 /* tr0 -> address held in r18; taken by the final blink */
  32. #if 0
  33. /* TAKum03020: loads of the first four source lines into r63, compiled
         out. The tag itself is not explained in this file. */
  34. ld.q r3, 0x00, r63
  35. ld.q r3, 0x20, r63
  36. ld.q r3, 0x40, r63
  37. ld.q r3, 0x60, r63
  38. #endif
  39. alloco r2, 0x00 /* allocate destination line 0 before the loop starts */
  40. synco ! TAKum03020
  41. alloco r2, 0x20 /* allocate destination line 1 before the loop starts */
  42. synco ! TAKum03020
  43. movi 3968, r6 /* 3968 = 4096 - 128, i.e. page size minus four lines */
  44. add r2, r6, r6 /* r6 = dst + 3968: start of the last four lines */
  45. addi r6, 64, r7 /* r7 = dst + 4032: start of the last two lines */
  46. addi r7, 64, r8 /* r8 = dst + 4096: end of page, the loop limit */
  47. sub r3, r2, r60 /* r60 = src - dst, so r2 + r60 addresses the source */
  48. addi r60, 8, r61 /* r61 = r60 + 8 */
  49. addi r61, 8, r62 /* r62 = r60 + 16 */
  50. addi r62, 8, r23 /* r23 = r60 + 24 */
  51. addi r60, 0x80, r22 /* r22 = r60 + 128: source offset four lines ahead */
  52. /* Minimal code size. The extra branches inside the loop don't cost much
  53. because they overlap with the time spent waiting for prefetches to
  54. complete. */
      /* Loop invariant: r2 is the destination address of the line copied next;
         it moves from dst to dst + 4096 in steps of 32. */
  55. 1:
      /* The prefetch step below is compiled out, so r6, r22 and tr2 are
         computed above but never consumed by the assembled code. */
  56. #if 0
  57. /* TAKum03020 */
  58. bge/u r2, r6, tr2 ! skip prefetch for last 4 lines
  59. ldx.q r2, r22, r63 ! prefetch 4 lines hence
  60. #endif
  61. 2:
  62. bge/u r2, r7, tr3 ! skip alloco for last 2 lines
  63. alloco r2, 0x40 ! alloc destination line 2 lines ahead
  64. synco ! TAKum03020
  65. 3:
  66. ldx.q r2, r60, r36 /* load source quadword at line offset 0 */
  67. ldx.q r2, r61, r37 /* load source quadword at line offset 8 */
  68. ldx.q r2, r62, r38 /* load source quadword at line offset 16 */
  69. ldx.q r2, r23, r39 /* load source quadword at line offset 24 */
  70. st.q r2, 0, r36 /* all four loads are issued before the first store */
  71. st.q r2, 8, r37
  72. st.q r2, 16, r38
  73. st.q r2, 24, r39
  74. addi r2, 32, r2 /* advance the destination pointer by one line */
  75. bgt/l r8, r2, tr1 /* repeat while r2 is still below r8 (end of page) */
  76. blink tr0, r63 ! return