// clear_user.S — clear a user-space buffer (ia64)
  1. /*
  2. * This routine clears to zero a linear memory buffer in user space.
  3. *
  4. * Inputs:
  5. * in0: address of buffer
  6. * in1: length of buffer in bytes
  7. * Outputs:
  8. * r8: number of bytes that didn't get cleared due to a fault
  9. *
  10. * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
  11. * Stephane Eranian <eranian@hpl.hp.com>
  12. */
  13. #include <asm/asmmacro.h>
  14. //
  15. // arguments
  16. //
  17. #define buf r32
  18. #define len r33
  19. //
  20. // local registers
  21. //
  22. #define cnt r16
  23. #define buf2 r17
  24. #define saved_lc r18
  25. #define saved_pfs r19
  26. #define tmp r20
  27. #define len2 r21
  28. #define len3 r22
  29. //
  30. // Theory of operations:
  31. // - we check whether or not the buffer is small, i.e., less than 17
  32. // in which case we do the byte by byte loop.
  33. //
  34. // - Otherwise we go progressively from 1 byte store to 8byte store in
  35. // the head part, the body is a 16byte store loop and we finish we the
  36. // tail for the last 15 bytes.
  37. // The good point about this breakdown is that the long buffer handling
  38. // contains only 2 branches.
  39. //
  40. // The reason for not using shifting & masking for both the head and the
  41. // tail is to stay semantically correct. This routine is not supposed
  42. // to write bytes outside of the buffer. While most of the time this would
  43. // be ok, we can't tolerate a mistake. A classical example is the case
  44. // of multithreaded code were to the extra bytes touched is actually owned
  45. // by another thread which runs concurrently to ours. Another, less likely,
  46. // example is with device drivers where reading an I/O mapped location may
  47. // have side effects (same thing for writing).
  48. //
  49. GLOBAL_ENTRY(__do_clear_user)
  50. .prologue
  51. .save ar.pfs, saved_pfs
  52. alloc saved_pfs=ar.pfs,2,0,0,0
  53. cmp.eq p6,p0=r0,len // check for zero length
  54. .save ar.lc, saved_lc
  55. mov saved_lc=ar.lc // preserve ar.lc (slow)
  56. .body
  57. ;; // avoid WAW on CFM
  58. adds tmp=-1,len // br.ctop is repeat/until
  59. mov ret0=len // return value is length at this point
  60. (p6) br.ret.spnt.many rp
  61. ;;
  62. cmp.lt p6,p0=16,len // if len > 16 then long memset
  63. mov ar.lc=tmp // initialize lc for small count
  64. (p6) br.cond.dptk .long_do_clear
  65. ;; // WAR on ar.lc
  66. //
  67. // worst case 16 iterations, avg 8 iterations
  68. //
  69. // We could have played with the predicates to use the extra
  70. // M slot for 2 stores/iteration but the cost the initialization
  71. // the various counters compared to how long the loop is supposed
  72. // to last on average does not make this solution viable.
  73. //
  74. 1:
  75. EX( .Lexit1, st1 [buf]=r0,1 )
  76. adds len=-1,len // countdown length using len
  77. br.cloop.dptk 1b
  78. ;; // avoid RAW on ar.lc
  79. //
  80. // .Lexit4: comes from byte by byte loop
  81. // len contains bytes left
  82. .Lexit1:
  83. mov ret0=len // faster than using ar.lc
  84. mov ar.lc=saved_lc
  85. br.ret.sptk.many rp // end of short clear_user
  86. //
  87. // At this point we know we have more than 16 bytes to copy
  88. // so we focus on alignment (no branches required)
  89. //
  90. // The use of len/len2 for countdown of the number of bytes left
  91. // instead of ret0 is due to the fact that the exception code
  92. // changes the values of r8.
  93. //
  94. .long_do_clear:
  95. tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
  96. ;;
  97. EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
  98. (p6) adds len=-1,len;; // sync because buf is modified
  99. tbit.nz p6,p0=buf,1
  100. ;;
  101. EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
  102. (p6) adds len=-2,len;;
  103. tbit.nz p6,p0=buf,2
  104. ;;
  105. EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
  106. (p6) adds len=-4,len;;
  107. tbit.nz p6,p0=buf,3
  108. ;;
  109. EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
  110. (p6) adds len=-8,len;;
  111. shr.u cnt=len,4 // number of 128-bit (2x64bit) words
  112. ;;
  113. cmp.eq p6,p0=r0,cnt
  114. adds tmp=-1,cnt
  115. (p6) br.cond.dpnt .dotail // we have less than 16 bytes left
  116. ;;
  117. adds buf2=8,buf // setup second base pointer
  118. mov ar.lc=tmp
  119. ;;
  120. //
  121. // 16bytes/iteration core loop
  122. //
  123. // The second store can never generate a fault because
  124. // we come into the loop only when we are 16-byte aligned.
  125. // This means that if we cross a page then it will always be
  126. // in the first store and never in the second.
  127. //
  128. //
  129. // We need to keep track of the remaining length. A possible (optimistic)
  130. // way would be to use ar.lc and derive how many byte were left by
  131. // doing : left= 16*ar.lc + 16. this would avoid the addition at
  132. // every iteration.
  133. // However we need to keep the synchronization point. A template
  134. // M;;MB does not exist and thus we can keep the addition at no
  135. // extra cycle cost (use a nop slot anyway). It also simplifies the
  136. // (unlikely) error recovery code
  137. //
  138. 2: EX(.Lexit3, st8 [buf]=r0,16 )
  139. ;; // needed to get len correct when error
  140. st8 [buf2]=r0,16
  141. adds len=-16,len
  142. br.cloop.dptk 2b
  143. ;;
  144. mov ar.lc=saved_lc
  145. //
  146. // tail correction based on len only
  147. //
  148. // We alternate the use of len3,len2 to allow parallelism and correct
  149. // error handling. We also reuse p6/p7 to return correct value.
  150. // The addition of len2/len3 does not cost anything more compared to
  151. // the regular memset as we had empty slots.
  152. //
  153. .dotail:
  154. mov len2=len // for parallelization of error handling
  155. mov len3=len
  156. tbit.nz p6,p0=len,3
  157. ;;
  158. EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
  159. (p6) adds len3=-8,len2
  160. tbit.nz p7,p6=len,2
  161. ;;
  162. EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
  163. (p7) adds len2=-4,len3
  164. tbit.nz p6,p7=len,1
  165. ;;
  166. EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
  167. (p6) adds len3=-2,len2
  168. tbit.nz p7,p6=len,0
  169. ;;
  170. EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
  171. mov ret0=r0 // success
  172. br.ret.sptk.many rp // end of most likely path
  173. //
  174. // Outlined error handling code
  175. //
  176. //
  177. // .Lexit3: comes from core loop, need restore pr/lc
  178. // len contains bytes left
  179. //
  180. //
  181. // .Lexit2:
  182. // if p6 -> coming from st8 or st2 : len2 contains what's left
  183. // if p7 -> coming from st4 or st1 : len3 contains what's left
  184. // We must restore lc/pr even though might not have been used.
  185. .Lexit2:
  186. .pred.rel "mutex", p6, p7
  187. (p6) mov len=len2
  188. (p7) mov len=len3
  189. ;;
  190. //
  191. // .Lexit4: comes from head, need not restore pr/lc
  192. // len contains bytes left
  193. //
  194. .Lexit3:
  195. mov ret0=len
  196. mov ar.lc=saved_lc
  197. br.ret.sptk.many rp
  198. END(__do_clear_user)