read_write.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/fsnotify.h>
  12. #include <linux/security.h>
  13. #include <linux/export.h>
  14. #include <linux/syscalls.h>
  15. #include <linux/pagemap.h>
  16. #include <linux/splice.h>
  17. #include <linux/compat.h>
  18. #include "internal.h"
  19. #include <asm/uaccess.h>
  20. #include <asm/unistd.h>
  21. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  22. typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
  23. const struct file_operations generic_ro_fops = {
  24. .llseek = generic_file_llseek,
  25. .read_iter = generic_file_read_iter,
  26. .mmap = generic_file_readonly_mmap,
  27. .splice_read = generic_file_splice_read,
  28. };
  29. EXPORT_SYMBOL(generic_ro_fops);
  30. static inline int unsigned_offsets(struct file *file)
  31. {
  32. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  33. }
  34. /**
  35. * vfs_setpos - update the file offset for lseek
  36. * @file: file structure in question
  37. * @offset: file offset to seek to
  38. * @maxsize: maximum file size
  39. *
  40. * This is a low-level filesystem helper for updating the file offset to
  41. * the value specified by @offset if the given offset is valid and it is
  42. * not equal to the current file offset.
  43. *
  44. * Return the specified offset on success and -EINVAL on invalid offset.
  45. */
  46. loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  47. {
  48. if (offset < 0 && !unsigned_offsets(file))
  49. return -EINVAL;
  50. if (offset > maxsize)
  51. return -EINVAL;
  52. if (offset != file->f_pos) {
  53. file->f_pos = offset;
  54. file->f_version = 0;
  55. }
  56. return offset;
  57. }
  58. EXPORT_SYMBOL(vfs_setpos);
  59. /**
  60. * generic_file_llseek_size - generic llseek implementation for regular files
  61. * @file: file structure to seek on
  62. * @offset: file offset to seek to
  63. * @whence: type of seek
  64. * @size: max size of this file in file system
  65. * @eof: offset used for SEEK_END position
  66. *
  67. * This is a variant of generic_file_llseek that allows passing in a custom
  68. * maximum file size and a custom EOF position, for e.g. hashed directories
  69. *
  70. * Synchronization:
  71. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  72. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  73. * read/writes behave like SEEK_SET against seeks.
  74. */
  75. loff_t
  76. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  77. loff_t maxsize, loff_t eof)
  78. {
  79. switch (whence) {
  80. case SEEK_END:
  81. offset += eof;
  82. break;
  83. case SEEK_CUR:
  84. /*
  85. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  86. * position-querying operation. Avoid rewriting the "same"
  87. * f_pos value back to the file because a concurrent read(),
  88. * write() or lseek() might have altered it
  89. */
  90. if (offset == 0)
  91. return file->f_pos;
  92. /*
  93. * f_lock protects against read/modify/write race with other
  94. * SEEK_CURs. Note that parallel writes and reads behave
  95. * like SEEK_SET.
  96. */
  97. spin_lock(&file->f_lock);
  98. offset = vfs_setpos(file, file->f_pos + offset, maxsize);
  99. spin_unlock(&file->f_lock);
  100. return offset;
  101. case SEEK_DATA:
  102. /*
  103. * In the generic case the entire file is data, so as long as
  104. * offset isn't at the end of the file then the offset is data.
  105. */
  106. if ((unsigned long long)offset >= eof)
  107. return -ENXIO;
  108. break;
  109. case SEEK_HOLE:
  110. /*
  111. * There is a virtual hole at the end of the file, so as long as
  112. * offset isn't i_size or larger, return i_size.
  113. */
  114. if ((unsigned long long)offset >= eof)
  115. return -ENXIO;
  116. offset = eof;
  117. break;
  118. }
  119. return vfs_setpos(file, offset, maxsize);
  120. }
  121. EXPORT_SYMBOL(generic_file_llseek_size);
  122. /**
  123. * generic_file_llseek - generic llseek implementation for regular files
  124. * @file: file structure to seek on
  125. * @offset: file offset to seek to
  126. * @whence: type of seek
  127. *
  128. * This is a generic implemenation of ->llseek useable for all normal local
  129. * filesystems. It just updates the file offset to the value specified by
  130. * @offset and @whence.
  131. */
  132. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  133. {
  134. struct inode *inode = file->f_mapping->host;
  135. return generic_file_llseek_size(file, offset, whence,
  136. inode->i_sb->s_maxbytes,
  137. i_size_read(inode));
  138. }
  139. EXPORT_SYMBOL(generic_file_llseek);
  140. /**
  141. * fixed_size_llseek - llseek implementation for fixed-sized devices
  142. * @file: file structure to seek on
  143. * @offset: file offset to seek to
  144. * @whence: type of seek
  145. * @size: size of the file
  146. *
  147. */
  148. loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
  149. {
  150. switch (whence) {
  151. case SEEK_SET: case SEEK_CUR: case SEEK_END:
  152. return generic_file_llseek_size(file, offset, whence,
  153. size, size);
  154. default:
  155. return -EINVAL;
  156. }
  157. }
  158. EXPORT_SYMBOL(fixed_size_llseek);
  159. /**
  160. * noop_llseek - No Operation Performed llseek implementation
  161. * @file: file structure to seek on
  162. * @offset: file offset to seek to
  163. * @whence: type of seek
  164. *
  165. * This is an implementation of ->llseek useable for the rare special case when
  166. * userspace expects the seek to succeed but the (device) file is actually not
  167. * able to perform the seek. In this case you use noop_llseek() instead of
  168. * falling back to the default implementation of ->llseek.
  169. */
  170. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  171. {
  172. return file->f_pos;
  173. }
  174. EXPORT_SYMBOL(noop_llseek);
  175. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  176. {
  177. return -ESPIPE;
  178. }
  179. EXPORT_SYMBOL(no_llseek);
  180. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  181. {
  182. struct inode *inode = file_inode(file);
  183. loff_t retval;
  184. mutex_lock(&inode->i_mutex);
  185. switch (whence) {
  186. case SEEK_END:
  187. offset += i_size_read(inode);
  188. break;
  189. case SEEK_CUR:
  190. if (offset == 0) {
  191. retval = file->f_pos;
  192. goto out;
  193. }
  194. offset += file->f_pos;
  195. break;
  196. case SEEK_DATA:
  197. /*
  198. * In the generic case the entire file is data, so as
  199. * long as offset isn't at the end of the file then the
  200. * offset is data.
  201. */
  202. if (offset >= inode->i_size) {
  203. retval = -ENXIO;
  204. goto out;
  205. }
  206. break;
  207. case SEEK_HOLE:
  208. /*
  209. * There is a virtual hole at the end of the file, so
  210. * as long as offset isn't i_size or larger, return
  211. * i_size.
  212. */
  213. if (offset >= inode->i_size) {
  214. retval = -ENXIO;
  215. goto out;
  216. }
  217. offset = inode->i_size;
  218. break;
  219. }
  220. retval = -EINVAL;
  221. if (offset >= 0 || unsigned_offsets(file)) {
  222. if (offset != file->f_pos) {
  223. file->f_pos = offset;
  224. file->f_version = 0;
  225. }
  226. retval = offset;
  227. }
  228. out:
  229. mutex_unlock(&inode->i_mutex);
  230. return retval;
  231. }
  232. EXPORT_SYMBOL(default_llseek);
  233. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  234. {
  235. loff_t (*fn)(struct file *, loff_t, int);
  236. fn = no_llseek;
  237. if (file->f_mode & FMODE_LSEEK) {
  238. if (file->f_op->llseek)
  239. fn = file->f_op->llseek;
  240. }
  241. return fn(file, offset, whence);
  242. }
  243. EXPORT_SYMBOL(vfs_llseek);
  244. static inline struct fd fdget_pos(int fd)
  245. {
  246. return __to_fd(__fdget_pos(fd));
  247. }
  248. static inline void fdput_pos(struct fd f)
  249. {
  250. if (f.flags & FDPUT_POS_UNLOCK)
  251. mutex_unlock(&f.file->f_pos_lock);
  252. fdput(f);
  253. }
  254. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  255. {
  256. off_t retval;
  257. struct fd f = fdget_pos(fd);
  258. if (!f.file)
  259. return -EBADF;
  260. retval = -EINVAL;
  261. if (whence <= SEEK_MAX) {
  262. loff_t res = vfs_llseek(f.file, offset, whence);
  263. retval = res;
  264. if (res != (loff_t)retval)
  265. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  266. }
  267. fdput_pos(f);
  268. return retval;
  269. }
  270. #ifdef CONFIG_COMPAT
  271. COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
  272. {
  273. return sys_lseek(fd, offset, whence);
  274. }
  275. #endif
  276. #ifdef __ARCH_WANT_SYS_LLSEEK
  277. SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
  278. unsigned long, offset_low, loff_t __user *, result,
  279. unsigned int, whence)
  280. {
  281. int retval;
  282. struct fd f = fdget_pos(fd);
  283. loff_t offset;
  284. if (!f.file)
  285. return -EBADF;
  286. retval = -EINVAL;
  287. if (whence > SEEK_MAX)
  288. goto out_putf;
  289. offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
  290. whence);
  291. retval = (int)offset;
  292. if (offset >= 0) {
  293. retval = -EFAULT;
  294. if (!copy_to_user(result, &offset, sizeof(offset)))
  295. retval = 0;
  296. }
  297. out_putf:
  298. fdput_pos(f);
  299. return retval;
  300. }
  301. #endif
  302. ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
  303. {
  304. struct kiocb kiocb;
  305. ssize_t ret;
  306. if (!file->f_op->read_iter)
  307. return -EINVAL;
  308. init_sync_kiocb(&kiocb, file);
  309. kiocb.ki_pos = *ppos;
  310. iter->type |= READ;
  311. ret = file->f_op->read_iter(&kiocb, iter);
  312. BUG_ON(ret == -EIOCBQUEUED);
  313. if (ret > 0)
  314. *ppos = kiocb.ki_pos;
  315. return ret;
  316. }
  317. EXPORT_SYMBOL(vfs_iter_read);
  318. ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
  319. {
  320. struct kiocb kiocb;
  321. ssize_t ret;
  322. if (!file->f_op->write_iter)
  323. return -EINVAL;
  324. init_sync_kiocb(&kiocb, file);
  325. kiocb.ki_pos = *ppos;
  326. iter->type |= WRITE;
  327. ret = file->f_op->write_iter(&kiocb, iter);
  328. BUG_ON(ret == -EIOCBQUEUED);
  329. if (ret > 0) {
  330. *ppos = kiocb.ki_pos;
  331. fsnotify_modify(file);
  332. }
  333. return ret;
  334. }
  335. EXPORT_SYMBOL(vfs_iter_write);
  336. /*
  337. * rw_verify_area doesn't like huge counts. We limit
  338. * them to something that fits in "int" so that others
  339. * won't have to do range checks all the time.
  340. */
  341. int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
  342. {
  343. struct inode *inode;
  344. loff_t pos;
  345. int retval = -EINVAL;
  346. inode = file_inode(file);
  347. if (unlikely((ssize_t) count < 0))
  348. return retval;
  349. pos = *ppos;
  350. if (unlikely(pos < 0)) {
  351. if (!unsigned_offsets(file))
  352. return retval;
  353. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  354. return -EOVERFLOW;
  355. } else if (unlikely((loff_t) (pos + count) < 0)) {
  356. if (!unsigned_offsets(file))
  357. return retval;
  358. }
  359. if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
  360. retval = locks_mandatory_area(
  361. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  362. inode, file, pos, count);
  363. if (retval < 0)
  364. return retval;
  365. }
  366. retval = security_file_permission(file,
  367. read_write == READ ? MAY_READ : MAY_WRITE);
  368. if (retval)
  369. return retval;
  370. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  371. }
  372. static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  373. {
  374. struct iovec iov = { .iov_base = buf, .iov_len = len };
  375. struct kiocb kiocb;
  376. struct iov_iter iter;
  377. ssize_t ret;
  378. init_sync_kiocb(&kiocb, filp);
  379. kiocb.ki_pos = *ppos;
  380. iov_iter_init(&iter, READ, &iov, 1, len);
  381. ret = filp->f_op->read_iter(&kiocb, &iter);
  382. BUG_ON(ret == -EIOCBQUEUED);
  383. *ppos = kiocb.ki_pos;
  384. return ret;
  385. }
  386. ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
  387. loff_t *pos)
  388. {
  389. if (file->f_op->read)
  390. return file->f_op->read(file, buf, count, pos);
  391. else if (file->f_op->read_iter)
  392. return new_sync_read(file, buf, count, pos);
  393. else
  394. return -EINVAL;
  395. }
  396. EXPORT_SYMBOL(__vfs_read);
  397. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  398. {
  399. ssize_t ret;
  400. if (!(file->f_mode & FMODE_READ))
  401. return -EBADF;
  402. if (!(file->f_mode & FMODE_CAN_READ))
  403. return -EINVAL;
  404. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  405. return -EFAULT;
  406. ret = rw_verify_area(READ, file, pos, count);
  407. if (ret >= 0) {
  408. count = ret;
  409. ret = __vfs_read(file, buf, count, pos);
  410. if (ret > 0) {
  411. fsnotify_access(file);
  412. add_rchar(current, ret);
  413. }
  414. inc_syscr(current);
  415. }
  416. return ret;
  417. }
  418. EXPORT_SYMBOL(vfs_read);
  419. static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  420. {
  421. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  422. struct kiocb kiocb;
  423. struct iov_iter iter;
  424. ssize_t ret;
  425. init_sync_kiocb(&kiocb, filp);
  426. kiocb.ki_pos = *ppos;
  427. iov_iter_init(&iter, WRITE, &iov, 1, len);
  428. ret = filp->f_op->write_iter(&kiocb, &iter);
  429. BUG_ON(ret == -EIOCBQUEUED);
  430. if (ret > 0)
  431. *ppos = kiocb.ki_pos;
  432. return ret;
  433. }
  434. ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
  435. loff_t *pos)
  436. {
  437. if (file->f_op->write)
  438. return file->f_op->write(file, p, count, pos);
  439. else if (file->f_op->write_iter)
  440. return new_sync_write(file, p, count, pos);
  441. else
  442. return -EINVAL;
  443. }
  444. EXPORT_SYMBOL(__vfs_write);
  445. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  446. {
  447. mm_segment_t old_fs;
  448. const char __user *p;
  449. ssize_t ret;
  450. if (!(file->f_mode & FMODE_CAN_WRITE))
  451. return -EINVAL;
  452. old_fs = get_fs();
  453. set_fs(get_ds());
  454. p = (__force const char __user *)buf;
  455. if (count > MAX_RW_COUNT)
  456. count = MAX_RW_COUNT;
  457. ret = __vfs_write(file, p, count, pos);
  458. set_fs(old_fs);
  459. if (ret > 0) {
  460. fsnotify_modify(file);
  461. add_wchar(current, ret);
  462. }
  463. inc_syscw(current);
  464. return ret;
  465. }
  466. EXPORT_SYMBOL(__kernel_write);
  467. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  468. {
  469. ssize_t ret;
  470. if (!(file->f_mode & FMODE_WRITE))
  471. return -EBADF;
  472. if (!(file->f_mode & FMODE_CAN_WRITE))
  473. return -EINVAL;
  474. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  475. return -EFAULT;
  476. ret = rw_verify_area(WRITE, file, pos, count);
  477. if (ret >= 0) {
  478. count = ret;
  479. file_start_write(file);
  480. ret = __vfs_write(file, buf, count, pos);
  481. if (ret > 0) {
  482. fsnotify_modify(file);
  483. add_wchar(current, ret);
  484. }
  485. inc_syscw(current);
  486. file_end_write(file);
  487. }
  488. return ret;
  489. }
  490. EXPORT_SYMBOL(vfs_write);
  491. static inline loff_t file_pos_read(struct file *file)
  492. {
  493. return file->f_pos;
  494. }
  495. static inline void file_pos_write(struct file *file, loff_t pos)
  496. {
  497. file->f_pos = pos;
  498. }
  499. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  500. {
  501. struct fd f = fdget_pos(fd);
  502. ssize_t ret = -EBADF;
  503. if (f.file) {
  504. loff_t pos = file_pos_read(f.file);
  505. ret = vfs_read(f.file, buf, count, &pos);
  506. if (ret >= 0)
  507. file_pos_write(f.file, pos);
  508. fdput_pos(f);
  509. }
  510. return ret;
  511. }
  512. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  513. size_t, count)
  514. {
  515. struct fd f = fdget_pos(fd);
  516. ssize_t ret = -EBADF;
  517. if (f.file) {
  518. loff_t pos = file_pos_read(f.file);
  519. ret = vfs_write(f.file, buf, count, &pos);
  520. if (ret >= 0)
  521. file_pos_write(f.file, pos);
  522. fdput_pos(f);
  523. }
  524. return ret;
  525. }
  526. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  527. size_t, count, loff_t, pos)
  528. {
  529. struct fd f;
  530. ssize_t ret = -EBADF;
  531. if (pos < 0)
  532. return -EINVAL;
  533. f = fdget(fd);
  534. if (f.file) {
  535. ret = -ESPIPE;
  536. if (f.file->f_mode & FMODE_PREAD)
  537. ret = vfs_read(f.file, buf, count, &pos);
  538. fdput(f);
  539. }
  540. return ret;
  541. }
  542. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  543. size_t, count, loff_t, pos)
  544. {
  545. struct fd f;
  546. ssize_t ret = -EBADF;
  547. if (pos < 0)
  548. return -EINVAL;
  549. f = fdget(fd);
  550. if (f.file) {
  551. ret = -ESPIPE;
  552. if (f.file->f_mode & FMODE_PWRITE)
  553. ret = vfs_write(f.file, buf, count, &pos);
  554. fdput(f);
  555. }
  556. return ret;
  557. }
  558. /*
  559. * Reduce an iovec's length in-place. Return the resulting number of segments
  560. */
  561. unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  562. {
  563. unsigned long seg = 0;
  564. size_t len = 0;
  565. while (seg < nr_segs) {
  566. seg++;
  567. if (len + iov->iov_len >= to) {
  568. iov->iov_len = to - len;
  569. break;
  570. }
  571. len += iov->iov_len;
  572. iov++;
  573. }
  574. return seg;
  575. }
  576. EXPORT_SYMBOL(iov_shorten);
  577. static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
  578. loff_t *ppos, iter_fn_t fn)
  579. {
  580. struct kiocb kiocb;
  581. ssize_t ret;
  582. init_sync_kiocb(&kiocb, filp);
  583. kiocb.ki_pos = *ppos;
  584. ret = fn(&kiocb, iter);
  585. BUG_ON(ret == -EIOCBQUEUED);
  586. *ppos = kiocb.ki_pos;
  587. return ret;
  588. }
  589. /* Do it by hand, with file-ops */
  590. static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
  591. loff_t *ppos, io_fn_t fn)
  592. {
  593. ssize_t ret = 0;
  594. while (iov_iter_count(iter)) {
  595. struct iovec iovec = iov_iter_iovec(iter);
  596. ssize_t nr;
  597. nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
  598. if (nr < 0) {
  599. if (!ret)
  600. ret = nr;
  601. break;
  602. }
  603. ret += nr;
  604. if (nr != iovec.iov_len)
  605. break;
  606. iov_iter_advance(iter, nr);
  607. }
  608. return ret;
  609. }
  610. /* A write operation does a read from user space and vice versa */
  611. #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
  612. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  613. unsigned long nr_segs, unsigned long fast_segs,
  614. struct iovec *fast_pointer,
  615. struct iovec **ret_pointer)
  616. {
  617. unsigned long seg;
  618. ssize_t ret;
  619. struct iovec *iov = fast_pointer;
  620. /*
  621. * SuS says "The readv() function *may* fail if the iovcnt argument
  622. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  623. * traditionally returned zero for zero segments, so...
  624. */
  625. if (nr_segs == 0) {
  626. ret = 0;
  627. goto out;
  628. }
  629. /*
  630. * First get the "struct iovec" from user memory and
  631. * verify all the pointers
  632. */
  633. if (nr_segs > UIO_MAXIOV) {
  634. ret = -EINVAL;
  635. goto out;
  636. }
  637. if (nr_segs > fast_segs) {
  638. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  639. if (iov == NULL) {
  640. ret = -ENOMEM;
  641. goto out;
  642. }
  643. }
  644. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  645. ret = -EFAULT;
  646. goto out;
  647. }
  648. /*
  649. * According to the Single Unix Specification we should return EINVAL
  650. * if an element length is < 0 when cast to ssize_t or if the
  651. * total length would overflow the ssize_t return value of the
  652. * system call.
  653. *
  654. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  655. * overflow case.
  656. */
  657. ret = 0;
  658. for (seg = 0; seg < nr_segs; seg++) {
  659. void __user *buf = iov[seg].iov_base;
  660. ssize_t len = (ssize_t)iov[seg].iov_len;
  661. /* see if we we're about to use an invalid len or if
  662. * it's about to overflow ssize_t */
  663. if (len < 0) {
  664. ret = -EINVAL;
  665. goto out;
  666. }
  667. if (type >= 0
  668. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  669. ret = -EFAULT;
  670. goto out;
  671. }
  672. if (len > MAX_RW_COUNT - ret) {
  673. len = MAX_RW_COUNT - ret;
  674. iov[seg].iov_len = len;
  675. }
  676. ret += len;
  677. }
  678. out:
  679. *ret_pointer = iov;
  680. return ret;
  681. }
  682. static ssize_t do_readv_writev(int type, struct file *file,
  683. const struct iovec __user * uvector,
  684. unsigned long nr_segs, loff_t *pos)
  685. {
  686. size_t tot_len;
  687. struct iovec iovstack[UIO_FASTIOV];
  688. struct iovec *iov = iovstack;
  689. struct iov_iter iter;
  690. ssize_t ret;
  691. io_fn_t fn;
  692. iter_fn_t iter_fn;
  693. ret = import_iovec(type, uvector, nr_segs,
  694. ARRAY_SIZE(iovstack), &iov, &iter);
  695. if (ret < 0)
  696. return ret;
  697. tot_len = iov_iter_count(&iter);
  698. if (!tot_len)
  699. goto out;
  700. ret = rw_verify_area(type, file, pos, tot_len);
  701. if (ret < 0)
  702. goto out;
  703. if (type == READ) {
  704. fn = file->f_op->read;
  705. iter_fn = file->f_op->read_iter;
  706. } else {
  707. fn = (io_fn_t)file->f_op->write;
  708. iter_fn = file->f_op->write_iter;
  709. file_start_write(file);
  710. }
  711. if (iter_fn)
  712. ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
  713. else
  714. ret = do_loop_readv_writev(file, &iter, pos, fn);
  715. if (type != READ)
  716. file_end_write(file);
  717. out:
  718. kfree(iov);
  719. if ((ret + (type == READ)) > 0) {
  720. if (type == READ)
  721. fsnotify_access(file);
  722. else
  723. fsnotify_modify(file);
  724. }
  725. return ret;
  726. }
  727. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  728. unsigned long vlen, loff_t *pos)
  729. {
  730. if (!(file->f_mode & FMODE_READ))
  731. return -EBADF;
  732. if (!(file->f_mode & FMODE_CAN_READ))
  733. return -EINVAL;
  734. return do_readv_writev(READ, file, vec, vlen, pos);
  735. }
  736. EXPORT_SYMBOL(vfs_readv);
  737. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  738. unsigned long vlen, loff_t *pos)
  739. {
  740. if (!(file->f_mode & FMODE_WRITE))
  741. return -EBADF;
  742. if (!(file->f_mode & FMODE_CAN_WRITE))
  743. return -EINVAL;
  744. return do_readv_writev(WRITE, file, vec, vlen, pos);
  745. }
  746. EXPORT_SYMBOL(vfs_writev);
  747. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  748. unsigned long, vlen)
  749. {
  750. struct fd f = fdget_pos(fd);
  751. ssize_t ret = -EBADF;
  752. if (f.file) {
  753. loff_t pos = file_pos_read(f.file);
  754. ret = vfs_readv(f.file, vec, vlen, &pos);
  755. if (ret >= 0)
  756. file_pos_write(f.file, pos);
  757. fdput_pos(f);
  758. }
  759. if (ret > 0)
  760. add_rchar(current, ret);
  761. inc_syscr(current);
  762. return ret;
  763. }
  764. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  765. unsigned long, vlen)
  766. {
  767. struct fd f = fdget_pos(fd);
  768. ssize_t ret = -EBADF;
  769. if (f.file) {
  770. loff_t pos = file_pos_read(f.file);
  771. ret = vfs_writev(f.file, vec, vlen, &pos);
  772. if (ret >= 0)
  773. file_pos_write(f.file, pos);
  774. fdput_pos(f);
  775. }
  776. if (ret > 0)
  777. add_wchar(current, ret);
  778. inc_syscw(current);
  779. return ret;
  780. }
  781. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  782. {
  783. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  784. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  785. }
  786. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  787. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  788. {
  789. loff_t pos = pos_from_hilo(pos_h, pos_l);
  790. struct fd f;
  791. ssize_t ret = -EBADF;
  792. if (pos < 0)
  793. return -EINVAL;
  794. f = fdget(fd);
  795. if (f.file) {
  796. ret = -ESPIPE;
  797. if (f.file->f_mode & FMODE_PREAD)
  798. ret = vfs_readv(f.file, vec, vlen, &pos);
  799. fdput(f);
  800. }
  801. if (ret > 0)
  802. add_rchar(current, ret);
  803. inc_syscr(current);
  804. return ret;
  805. }
  806. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  807. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  808. {
  809. loff_t pos = pos_from_hilo(pos_h, pos_l);
  810. struct fd f;
  811. ssize_t ret = -EBADF;
  812. if (pos < 0)
  813. return -EINVAL;
  814. f = fdget(fd);
  815. if (f.file) {
  816. ret = -ESPIPE;
  817. if (f.file->f_mode & FMODE_PWRITE)
  818. ret = vfs_writev(f.file, vec, vlen, &pos);
  819. fdput(f);
  820. }
  821. if (ret > 0)
  822. add_wchar(current, ret);
  823. inc_syscw(current);
  824. return ret;
  825. }
  826. #ifdef CONFIG_COMPAT
  827. static ssize_t compat_do_readv_writev(int type, struct file *file,
  828. const struct compat_iovec __user *uvector,
  829. unsigned long nr_segs, loff_t *pos)
  830. {
  831. compat_ssize_t tot_len;
  832. struct iovec iovstack[UIO_FASTIOV];
  833. struct iovec *iov = iovstack;
  834. struct iov_iter iter;
  835. ssize_t ret;
  836. io_fn_t fn;
  837. iter_fn_t iter_fn;
  838. ret = compat_import_iovec(type, uvector, nr_segs,
  839. UIO_FASTIOV, &iov, &iter);
  840. if (ret < 0)
  841. return ret;
  842. tot_len = iov_iter_count(&iter);
  843. if (!tot_len)
  844. goto out;
  845. ret = rw_verify_area(type, file, pos, tot_len);
  846. if (ret < 0)
  847. goto out;
  848. if (type == READ) {
  849. fn = file->f_op->read;
  850. iter_fn = file->f_op->read_iter;
  851. } else {
  852. fn = (io_fn_t)file->f_op->write;
  853. iter_fn = file->f_op->write_iter;
  854. file_start_write(file);
  855. }
  856. if (iter_fn)
  857. ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
  858. else
  859. ret = do_loop_readv_writev(file, &iter, pos, fn);
  860. if (type != READ)
  861. file_end_write(file);
  862. out:
  863. kfree(iov);
  864. if ((ret + (type == READ)) > 0) {
  865. if (type == READ)
  866. fsnotify_access(file);
  867. else
  868. fsnotify_modify(file);
  869. }
  870. return ret;
  871. }
  872. static size_t compat_readv(struct file *file,
  873. const struct compat_iovec __user *vec,
  874. unsigned long vlen, loff_t *pos)
  875. {
  876. ssize_t ret = -EBADF;
  877. if (!(file->f_mode & FMODE_READ))
  878. goto out;
  879. ret = -EINVAL;
  880. if (!(file->f_mode & FMODE_CAN_READ))
  881. goto out;
  882. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  883. out:
  884. if (ret > 0)
  885. add_rchar(current, ret);
  886. inc_syscr(current);
  887. return ret;
  888. }
  889. COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
  890. const struct compat_iovec __user *,vec,
  891. compat_ulong_t, vlen)
  892. {
  893. struct fd f = fdget_pos(fd);
  894. ssize_t ret;
  895. loff_t pos;
  896. if (!f.file)
  897. return -EBADF;
  898. pos = f.file->f_pos;
  899. ret = compat_readv(f.file, vec, vlen, &pos);
  900. if (ret >= 0)
  901. f.file->f_pos = pos;
  902. fdput_pos(f);
  903. return ret;
  904. }
  905. static long __compat_sys_preadv64(unsigned long fd,
  906. const struct compat_iovec __user *vec,
  907. unsigned long vlen, loff_t pos)
  908. {
  909. struct fd f;
  910. ssize_t ret;
  911. if (pos < 0)
  912. return -EINVAL;
  913. f = fdget(fd);
  914. if (!f.file)
  915. return -EBADF;
  916. ret = -ESPIPE;
  917. if (f.file->f_mode & FMODE_PREAD)
  918. ret = compat_readv(f.file, vec, vlen, &pos);
  919. fdput(f);
  920. return ret;
  921. }
  922. #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
  923. COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  924. const struct compat_iovec __user *,vec,
  925. unsigned long, vlen, loff_t, pos)
  926. {
  927. return __compat_sys_preadv64(fd, vec, vlen, pos);
  928. }
  929. #endif
  930. COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
  931. const struct compat_iovec __user *,vec,
  932. compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  933. {
  934. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  935. return __compat_sys_preadv64(fd, vec, vlen, pos);
  936. }
  937. static size_t compat_writev(struct file *file,
  938. const struct compat_iovec __user *vec,
  939. unsigned long vlen, loff_t *pos)
  940. {
  941. ssize_t ret = -EBADF;
  942. if (!(file->f_mode & FMODE_WRITE))
  943. goto out;
  944. ret = -EINVAL;
  945. if (!(file->f_mode & FMODE_CAN_WRITE))
  946. goto out;
  947. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  948. out:
  949. if (ret > 0)
  950. add_wchar(current, ret);
  951. inc_syscw(current);
  952. return ret;
  953. }
  954. COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
  955. const struct compat_iovec __user *, vec,
  956. compat_ulong_t, vlen)
  957. {
  958. struct fd f = fdget_pos(fd);
  959. ssize_t ret;
  960. loff_t pos;
  961. if (!f.file)
  962. return -EBADF;
  963. pos = f.file->f_pos;
  964. ret = compat_writev(f.file, vec, vlen, &pos);
  965. if (ret >= 0)
  966. f.file->f_pos = pos;
  967. fdput_pos(f);
  968. return ret;
  969. }
  970. static long __compat_sys_pwritev64(unsigned long fd,
  971. const struct compat_iovec __user *vec,
  972. unsigned long vlen, loff_t pos)
  973. {
  974. struct fd f;
  975. ssize_t ret;
  976. if (pos < 0)
  977. return -EINVAL;
  978. f = fdget(fd);
  979. if (!f.file)
  980. return -EBADF;
  981. ret = -ESPIPE;
  982. if (f.file->f_mode & FMODE_PWRITE)
  983. ret = compat_writev(f.file, vec, vlen, &pos);
  984. fdput(f);
  985. return ret;
  986. }
  987. #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
  988. COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  989. const struct compat_iovec __user *,vec,
  990. unsigned long, vlen, loff_t, pos)
  991. {
  992. return __compat_sys_pwritev64(fd, vec, vlen, pos);
  993. }
  994. #endif
  995. COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
  996. const struct compat_iovec __user *,vec,
  997. compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  998. {
  999. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  1000. return __compat_sys_pwritev64(fd, vec, vlen, pos);
  1001. }
  1002. #endif
  1003. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  1004. size_t count, loff_t max)
  1005. {
  1006. struct fd in, out;
  1007. struct inode *in_inode, *out_inode;
  1008. loff_t pos;
  1009. loff_t out_pos;
  1010. ssize_t retval;
  1011. int fl;
  1012. /*
  1013. * Get input file, and verify that it is ok..
  1014. */
  1015. retval = -EBADF;
  1016. in = fdget(in_fd);
  1017. if (!in.file)
  1018. goto out;
  1019. if (!(in.file->f_mode & FMODE_READ))
  1020. goto fput_in;
  1021. retval = -ESPIPE;
  1022. if (!ppos) {
  1023. pos = in.file->f_pos;
  1024. } else {
  1025. pos = *ppos;
  1026. if (!(in.file->f_mode & FMODE_PREAD))
  1027. goto fput_in;
  1028. }
  1029. retval = rw_verify_area(READ, in.file, &pos, count);
  1030. if (retval < 0)
  1031. goto fput_in;
  1032. count = retval;
  1033. /*
  1034. * Get output file, and verify that it is ok..
  1035. */
  1036. retval = -EBADF;
  1037. out = fdget(out_fd);
  1038. if (!out.file)
  1039. goto fput_in;
  1040. if (!(out.file->f_mode & FMODE_WRITE))
  1041. goto fput_out;
  1042. retval = -EINVAL;
  1043. in_inode = file_inode(in.file);
  1044. out_inode = file_inode(out.file);
  1045. out_pos = out.file->f_pos;
  1046. retval = rw_verify_area(WRITE, out.file, &out_pos, count);
  1047. if (retval < 0)
  1048. goto fput_out;
  1049. count = retval;
  1050. if (!max)
  1051. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  1052. if (unlikely(pos + count > max)) {
  1053. retval = -EOVERFLOW;
  1054. if (pos >= max)
  1055. goto fput_out;
  1056. count = max - pos;
  1057. }
  1058. fl = 0;
  1059. #if 0
  1060. /*
  1061. * We need to debate whether we can enable this or not. The
  1062. * man page documents EAGAIN return for the output at least,
  1063. * and the application is arguably buggy if it doesn't expect
  1064. * EAGAIN on a non-blocking file descriptor.
  1065. */
  1066. if (in.file->f_flags & O_NONBLOCK)
  1067. fl = SPLICE_F_NONBLOCK;
  1068. #endif
  1069. file_start_write(out.file);
  1070. retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
  1071. file_end_write(out.file);
  1072. if (retval > 0) {
  1073. add_rchar(current, retval);
  1074. add_wchar(current, retval);
  1075. fsnotify_access(in.file);
  1076. fsnotify_modify(out.file);
  1077. out.file->f_pos = out_pos;
  1078. if (ppos)
  1079. *ppos = pos;
  1080. else
  1081. in.file->f_pos = pos;
  1082. }
  1083. inc_syscr(current);
  1084. inc_syscw(current);
  1085. if (pos > max)
  1086. retval = -EOVERFLOW;
  1087. fput_out:
  1088. fdput(out);
  1089. fput_in:
  1090. fdput(in);
  1091. out:
  1092. return retval;
  1093. }
  1094. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  1095. {
  1096. loff_t pos;
  1097. off_t off;
  1098. ssize_t ret;
  1099. if (offset) {
  1100. if (unlikely(get_user(off, offset)))
  1101. return -EFAULT;
  1102. pos = off;
  1103. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1104. if (unlikely(put_user(pos, offset)))
  1105. return -EFAULT;
  1106. return ret;
  1107. }
  1108. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1109. }
  1110. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1111. {
  1112. loff_t pos;
  1113. ssize_t ret;
  1114. if (offset) {
  1115. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1116. return -EFAULT;
  1117. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1118. if (unlikely(put_user(pos, offset)))
  1119. return -EFAULT;
  1120. return ret;
  1121. }
  1122. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1123. }
  1124. #ifdef CONFIG_COMPAT
  1125. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1126. compat_off_t __user *, offset, compat_size_t, count)
  1127. {
  1128. loff_t pos;
  1129. off_t off;
  1130. ssize_t ret;
  1131. if (offset) {
  1132. if (unlikely(get_user(off, offset)))
  1133. return -EFAULT;
  1134. pos = off;
  1135. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1136. if (unlikely(put_user(pos, offset)))
  1137. return -EFAULT;
  1138. return ret;
  1139. }
  1140. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1141. }
  1142. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1143. compat_loff_t __user *, offset, compat_size_t, count)
  1144. {
  1145. loff_t pos;
  1146. ssize_t ret;
  1147. if (offset) {
  1148. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1149. return -EFAULT;
  1150. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1151. if (unlikely(put_user(pos, offset)))
  1152. return -EFAULT;
  1153. return ret;
  1154. }
  1155. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1156. }
  1157. #endif