eventfd.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. /*
  2. * fs/eventfd.c
  3. *
  4. * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
  5. *
  6. */
  7. #include <linux/file.h>
  8. #include <linux/poll.h>
  9. #include <linux/init.h>
  10. #include <linux/fs.h>
  11. #include <linux/sched.h>
  12. #include <linux/kernel.h>
  13. #include <linux/slab.h>
  14. #include <linux/list.h>
  15. #include <linux/spinlock.h>
  16. #include <linux/anon_inodes.h>
  17. #include <linux/syscalls.h>
  18. #include <linux/export.h>
  19. #include <linux/kref.h>
  20. #include <linux/eventfd.h>
  21. #include <linux/proc_fs.h>
  22. #include <linux/seq_file.h>
  23. struct eventfd_ctx {
  24. struct kref kref;
  25. wait_queue_head_t wqh;
  26. /*
  27. * Every time that a write(2) is performed on an eventfd, the
  28. * value of the __u64 being written is added to "count" and a
  29. * wakeup is performed on "wqh". A read(2) will return the "count"
  30. * value to userspace, and will reset "count" to zero. The kernel
  31. * side eventfd_signal() also, adds to the "count" counter and
  32. * issue a wakeup.
  33. */
  34. __u64 count;
  35. unsigned int flags;
  36. };
  37. /**
  38. * eventfd_signal - Adds @n to the eventfd counter.
  39. * @ctx: [in] Pointer to the eventfd context.
  40. * @n: [in] Value of the counter to be added to the eventfd internal counter.
  41. * The value cannot be negative.
  42. *
  43. * This function is supposed to be called by the kernel in paths that do not
  44. * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
  45. * value, and we signal this as overflow condition by returining a POLLERR
  46. * to poll(2).
  47. *
  48. * Returns the amount by which the counter was incrememnted. This will be less
  49. * than @n if the counter has overflowed.
  50. */
  51. __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
  52. {
  53. unsigned long flags;
  54. spin_lock_irqsave(&ctx->wqh.lock, flags);
  55. if (ULLONG_MAX - ctx->count < n)
  56. n = ULLONG_MAX - ctx->count;
  57. ctx->count += n;
  58. if (waitqueue_active(&ctx->wqh))
  59. wake_up_locked_poll(&ctx->wqh, POLLIN);
  60. spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  61. return n;
  62. }
  63. EXPORT_SYMBOL_GPL(eventfd_signal);
  64. static void eventfd_free_ctx(struct eventfd_ctx *ctx)
  65. {
  66. kfree(ctx);
  67. }
  68. static void eventfd_free(struct kref *kref)
  69. {
  70. struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
  71. eventfd_free_ctx(ctx);
  72. }
  73. /**
  74. * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
  75. * @ctx: [in] Pointer to the eventfd context.
  76. *
  77. * Returns: In case of success, returns a pointer to the eventfd context.
  78. */
  79. struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
  80. {
  81. kref_get(&ctx->kref);
  82. return ctx;
  83. }
  84. EXPORT_SYMBOL_GPL(eventfd_ctx_get);
  85. /**
  86. * eventfd_ctx_put - Releases a reference to the internal eventfd context.
  87. * @ctx: [in] Pointer to eventfd context.
  88. *
  89. * The eventfd context reference must have been previously acquired either
  90. * with eventfd_ctx_get() or eventfd_ctx_fdget().
  91. */
  92. void eventfd_ctx_put(struct eventfd_ctx *ctx)
  93. {
  94. kref_put(&ctx->kref, eventfd_free);
  95. }
  96. EXPORT_SYMBOL_GPL(eventfd_ctx_put);
  97. static int eventfd_release(struct inode *inode, struct file *file)
  98. {
  99. struct eventfd_ctx *ctx = file->private_data;
  100. wake_up_poll(&ctx->wqh, POLLHUP);
  101. eventfd_ctx_put(ctx);
  102. return 0;
  103. }
  104. static unsigned int eventfd_poll(struct file *file, poll_table *wait)
  105. {
  106. struct eventfd_ctx *ctx = file->private_data;
  107. unsigned int events = 0;
  108. u64 count;
  109. poll_wait(file, &ctx->wqh, wait);
  110. smp_rmb();
  111. count = ctx->count;
  112. if (count > 0)
  113. events |= POLLIN;
  114. if (count == ULLONG_MAX)
  115. events |= POLLERR;
  116. if (ULLONG_MAX - 1 > count)
  117. events |= POLLOUT;
  118. return events;
  119. }
  120. static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
  121. {
  122. *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
  123. ctx->count -= *cnt;
  124. }
  125. /**
  126. * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
  127. * @ctx: [in] Pointer to eventfd context.
  128. * @wait: [in] Wait queue to be removed.
  129. * @cnt: [out] Pointer to the 64-bit counter value.
  130. *
  131. * Returns %0 if successful, or the following error codes:
  132. *
  133. * -EAGAIN : The operation would have blocked.
  134. *
  135. * This is used to atomically remove a wait queue entry from the eventfd wait
  136. * queue head, and read/reset the counter value.
  137. */
  138. int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
  139. __u64 *cnt)
  140. {
  141. unsigned long flags;
  142. spin_lock_irqsave(&ctx->wqh.lock, flags);
  143. eventfd_ctx_do_read(ctx, cnt);
  144. __remove_wait_queue(&ctx->wqh, wait);
  145. if (*cnt != 0 && waitqueue_active(&ctx->wqh))
  146. wake_up_locked_poll(&ctx->wqh, POLLOUT);
  147. spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  148. return *cnt != 0 ? 0 : -EAGAIN;
  149. }
  150. EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
  151. /**
  152. * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
  153. * @ctx: [in] Pointer to eventfd context.
  154. * @no_wait: [in] Different from zero if the operation should not block.
  155. * @cnt: [out] Pointer to the 64-bit counter value.
  156. *
  157. * Returns %0 if successful, or the following error codes:
  158. *
  159. * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
  160. * -ERESTARTSYS : A signal interrupted the wait operation.
  161. *
  162. * If @no_wait is zero, the function might sleep until the eventfd internal
  163. * counter becomes greater than zero.
  164. */
  165. ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
  166. {
  167. ssize_t res;
  168. DECLARE_WAITQUEUE(wait, current);
  169. spin_lock_irq(&ctx->wqh.lock);
  170. *cnt = 0;
  171. res = -EAGAIN;
  172. if (ctx->count > 0)
  173. res = 0;
  174. else if (!no_wait) {
  175. __add_wait_queue(&ctx->wqh, &wait);
  176. for (;;) {
  177. set_current_state(TASK_INTERRUPTIBLE);
  178. if (ctx->count > 0) {
  179. res = 0;
  180. break;
  181. }
  182. if (signal_pending(current)) {
  183. res = -ERESTARTSYS;
  184. break;
  185. }
  186. spin_unlock_irq(&ctx->wqh.lock);
  187. schedule();
  188. spin_lock_irq(&ctx->wqh.lock);
  189. }
  190. __remove_wait_queue(&ctx->wqh, &wait);
  191. __set_current_state(TASK_RUNNING);
  192. }
  193. if (likely(res == 0)) {
  194. eventfd_ctx_do_read(ctx, cnt);
  195. if (waitqueue_active(&ctx->wqh))
  196. wake_up_locked_poll(&ctx->wqh, POLLOUT);
  197. }
  198. spin_unlock_irq(&ctx->wqh.lock);
  199. return res;
  200. }
  201. EXPORT_SYMBOL_GPL(eventfd_ctx_read);
  202. static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
  203. loff_t *ppos)
  204. {
  205. struct eventfd_ctx *ctx = file->private_data;
  206. ssize_t res;
  207. __u64 cnt;
  208. if (count < sizeof(cnt))
  209. return -EINVAL;
  210. res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
  211. if (res < 0)
  212. return res;
  213. return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
  214. }
  215. static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
  216. loff_t *ppos)
  217. {
  218. struct eventfd_ctx *ctx = file->private_data;
  219. ssize_t res;
  220. __u64 ucnt;
  221. DECLARE_WAITQUEUE(wait, current);
  222. if (count < sizeof(ucnt))
  223. return -EINVAL;
  224. if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
  225. return -EFAULT;
  226. if (ucnt == ULLONG_MAX)
  227. return -EINVAL;
  228. spin_lock_irq(&ctx->wqh.lock);
  229. res = -EAGAIN;
  230. if (ULLONG_MAX - ctx->count > ucnt)
  231. res = sizeof(ucnt);
  232. else if (!(file->f_flags & O_NONBLOCK)) {
  233. __add_wait_queue(&ctx->wqh, &wait);
  234. for (res = 0;;) {
  235. set_current_state(TASK_INTERRUPTIBLE);
  236. if (ULLONG_MAX - ctx->count > ucnt) {
  237. res = sizeof(ucnt);
  238. break;
  239. }
  240. if (signal_pending(current)) {
  241. res = -ERESTARTSYS;
  242. break;
  243. }
  244. spin_unlock_irq(&ctx->wqh.lock);
  245. schedule();
  246. spin_lock_irq(&ctx->wqh.lock);
  247. }
  248. __remove_wait_queue(&ctx->wqh, &wait);
  249. __set_current_state(TASK_RUNNING);
  250. }
  251. if (likely(res > 0)) {
  252. ctx->count += ucnt;
  253. if (waitqueue_active(&ctx->wqh))
  254. wake_up_locked_poll(&ctx->wqh, POLLIN);
  255. }
  256. spin_unlock_irq(&ctx->wqh.lock);
  257. return res;
  258. }
  259. #ifdef CONFIG_PROC_FS
  260. static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
  261. {
  262. struct eventfd_ctx *ctx = f->private_data;
  263. spin_lock_irq(&ctx->wqh.lock);
  264. seq_printf(m, "eventfd-count: %16llx\n",
  265. (unsigned long long)ctx->count);
  266. spin_unlock_irq(&ctx->wqh.lock);
  267. }
  268. #endif
  269. static const struct file_operations eventfd_fops = {
  270. #ifdef CONFIG_PROC_FS
  271. .show_fdinfo = eventfd_show_fdinfo,
  272. #endif
  273. .release = eventfd_release,
  274. .poll = eventfd_poll,
  275. .read = eventfd_read,
  276. .write = eventfd_write,
  277. .llseek = noop_llseek,
  278. };
  279. /**
  280. * eventfd_fget - Acquire a reference of an eventfd file descriptor.
  281. * @fd: [in] Eventfd file descriptor.
  282. *
  283. * Returns a pointer to the eventfd file structure in case of success, or the
  284. * following error pointer:
  285. *
  286. * -EBADF : Invalid @fd file descriptor.
  287. * -EINVAL : The @fd file descriptor is not an eventfd file.
  288. */
  289. struct file *eventfd_fget(int fd)
  290. {
  291. struct file *file;
  292. file = fget(fd);
  293. if (!file)
  294. return ERR_PTR(-EBADF);
  295. if (file->f_op != &eventfd_fops) {
  296. fput(file);
  297. return ERR_PTR(-EINVAL);
  298. }
  299. return file;
  300. }
  301. EXPORT_SYMBOL_GPL(eventfd_fget);
  302. /**
  303. * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
  304. * @fd: [in] Eventfd file descriptor.
  305. *
  306. * Returns a pointer to the internal eventfd context, otherwise the error
  307. * pointers returned by the following functions:
  308. *
  309. * eventfd_fget
  310. */
  311. struct eventfd_ctx *eventfd_ctx_fdget(int fd)
  312. {
  313. struct eventfd_ctx *ctx;
  314. struct fd f = fdget(fd);
  315. if (!f.file)
  316. return ERR_PTR(-EBADF);
  317. ctx = eventfd_ctx_fileget(f.file);
  318. fdput(f);
  319. return ctx;
  320. }
  321. EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
  322. /**
  323. * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
  324. * @file: [in] Eventfd file pointer.
  325. *
  326. * Returns a pointer to the internal eventfd context, otherwise the error
  327. * pointer:
  328. *
  329. * -EINVAL : The @fd file descriptor is not an eventfd file.
  330. */
  331. struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
  332. {
  333. if (file->f_op != &eventfd_fops)
  334. return ERR_PTR(-EINVAL);
  335. return eventfd_ctx_get(file->private_data);
  336. }
  337. EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
  338. /**
  339. * eventfd_file_create - Creates an eventfd file pointer.
  340. * @count: Initial eventfd counter value.
  341. * @flags: Flags for the eventfd file.
  342. *
  343. * This function creates an eventfd file pointer, w/out installing it into
  344. * the fd table. This is useful when the eventfd file is used during the
  345. * initialization of data structures that require extra setup after the eventfd
  346. * creation. So the eventfd creation is split into the file pointer creation
  347. * phase, and the file descriptor installation phase.
  348. * In this way races with userspace closing the newly installed file descriptor
  349. * can be avoided.
  350. * Returns an eventfd file pointer, or a proper error pointer.
  351. */
  352. struct file *eventfd_file_create(unsigned int count, int flags)
  353. {
  354. struct file *file;
  355. struct eventfd_ctx *ctx;
  356. /* Check the EFD_* constants for consistency. */
  357. BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
  358. BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
  359. if (flags & ~EFD_FLAGS_SET)
  360. return ERR_PTR(-EINVAL);
  361. ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
  362. if (!ctx)
  363. return ERR_PTR(-ENOMEM);
  364. kref_init(&ctx->kref);
  365. init_waitqueue_head(&ctx->wqh);
  366. ctx->count = count;
  367. ctx->flags = flags;
  368. file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
  369. O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
  370. if (IS_ERR(file))
  371. eventfd_free_ctx(ctx);
  372. return file;
  373. }
  374. SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
  375. {
  376. int fd, error;
  377. struct file *file;
  378. error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
  379. if (error < 0)
  380. return error;
  381. fd = error;
  382. file = eventfd_file_create(count, flags);
  383. if (IS_ERR(file)) {
  384. error = PTR_ERR(file);
  385. goto err_put_unused_fd;
  386. }
  387. fd_install(fd, file);
  388. return fd;
  389. err_put_unused_fd:
  390. put_unused_fd(fd);
  391. return error;
  392. }
  393. SYSCALL_DEFINE1(eventfd, unsigned int, count)
  394. {
  395. return sys_eventfd2(count, 0);
  396. }