vfio_iommu_type1.c

/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this. We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped. The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory. We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

struct vfio_iommu {
	struct list_head	domain_list;
	struct mutex		lock;
	struct rb_root		dma_list;
	bool			v2;
	bool			nesting;
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	int			prot;		/* IOMMU_CACHE */
	bool			fgsp;		/* Fine-grained super pages */
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
};

struct vfio_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
};

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}

static int vfio_lock_acct(long npage, bool *lock_cap)
{
	int ret = 0;

	if (!npage)
		return 0;

	if (!current->mm)
		return -ESRCH; /* process exited */

	down_write(&current->mm->mmap_sem);
	if (npage > 0) {
		if (lock_cap ? !*lock_cap : !capable(CAP_IPC_LOCK)) {
			unsigned long limit;

			limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

			if (current->mm->locked_vm + npage > limit)
				ret = -ENOMEM;
		}
	}

	if (!ret)
		current->mm->locked_vm += npage;

	up_write(&current->mm->mmap_sem);

	return ret;
}

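/*
 * Illustrative sketch (assumed caller behaviour, not part of this driver):
 * every pinned page is charged to current->mm->locked_vm, so an unprivileged
 * process driving VFIO typically needs CAP_IPC_LOCK or a raised memlock
 * limit before issuing large VFIO_IOMMU_MAP_DMA calls, e.g.:
 *
 *	struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY };
 *
 *	if (setrlimit(RLIMIT_MEMLOCK, &rl))	// needs privilege or a matching
 *		perror("setrlimit");		// hard limit set by the admin
 */
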
/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device. These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn)) {
		bool reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_head(tail);

		reserved = !!(PageReserved(head));
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);

		if (prot & IOMMU_WRITE)
			SetPageDirty(page);
		put_page(page);
		return 1;
	}
	return 0;
}

static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
	struct page *page[1];
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	down_read(&current->mm->mmap_sem);

	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		if (is_invalid_reserved_pfn(*pfn))
			ret = 0;
	}

	up_read(&current->mm->mmap_sem);

	return ret;
}

/*
 * Attempt to pin pages. We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages(unsigned long vaddr, long npage,
			   int prot, unsigned long *pfn_base)
{
	unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	bool lock_cap = capable(CAP_IPC_LOCK);
	long ret, i = 1;
	bool rsvd;

	if (!current->mm)
		return -ENODEV;

	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
	if (ret)
		return ret;

	rsvd = is_invalid_reserved_pfn(*pfn_base);

	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
		put_pfn(*pfn_base, prot);
		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
			limit << PAGE_SHIFT);
		return -ENOMEM;
	}

	if (unlikely(disable_hugepages))
		goto out;

	/* Lock all the consecutive pages from pfn_base */
	for (vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
		ret = vaddr_get_pfn(vaddr, prot, &pfn);
		if (ret)
			break;

		if (pfn != *pfn_base + i ||
		    rsvd != is_invalid_reserved_pfn(pfn)) {
			put_pfn(pfn, prot);
			break;
		}

		if (!rsvd && !lock_cap &&
		    current->mm->locked_vm + i + 1 > limit) {
			put_pfn(pfn, prot);
			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
				__func__, limit << PAGE_SHIFT);
			ret = -ENOMEM;
			goto unpin_out;
		}
	}

out:
	if (!rsvd)
		ret = vfio_lock_acct(i, &lock_cap);

unpin_out:
	if (ret) {
		if (!rsvd) {
			for (pfn = *pfn_base; i; pfn++, i--)
				put_pfn(pfn, prot);
		}

		return ret;
	}

	return i;
}

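/*
 * Worked example (illustrative, not from the original source): with
 * npage = 512, if only the first 16 user pages are physically contiguous
 * with *pfn_base, the loop above stops there, the 16 pages are accounted,
 * and 16 is returned; vfio_dma_do_map() below maps that chunk and calls
 * back in for the remainder.
 */
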
static long vfio_unpin_pages(unsigned long pfn, long npage,
			     int prot, bool do_accounting)
{
	unsigned long unlocked = 0;
	long i;

	for (i = 0; i < npage; i++)
		unlocked += put_pfn(pfn++, prot);

	if (do_accounting)
		vfio_lock_acct(-unlocked, NULL);

	return unlocked;
}

static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	long unlocked = 0;

	if (!dma->size)
		return;
	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system. Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin. The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, dma->iova, dma->size);
		cond_resched();
	}

	while (iova < end) {
		size_t unmapped, len;
		phys_addr_t phys, next;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		/*
		 * To optimize for fewer iommu_unmap() calls, each of which
		 * may require hardware cache flushing, try to find the
		 * largest contiguous physical memory chunk to unmap.
		 */
		for (len = PAGE_SIZE;
		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
			next = iommu_iova_to_phys(domain->domain, iova + len);
			if (next != phys + len)
				break;
		}

		unmapped = iommu_unmap(domain->domain, iova, len);
		if (WARN_ON(!unmapped))
			break;

		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
					     unmapped >> PAGE_SHIFT,
					     dma->prot, false);
		iova += unmapped;

		cond_resched();
	}

	vfio_lock_acct(-unlocked, NULL);
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	vfio_unmap_unpin(iommu, dma);
	vfio_unlink_dma(iommu, dma);
	kfree(dma);
}

static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	unsigned long bitmap = ULONG_MAX;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next)
		bitmap &= domain->domain->ops->pgsize_bitmap;
	mutex_unlock(&iommu->lock);

	/*
	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
	 * That way the user will be able to map/unmap buffers whose size/
	 * start address is aligned with PAGE_SIZE. Pinning code uses that
	 * granularity while iommu driver can use the sub-PAGE_SIZE size
	 * to map the buffer.
	 */
	if (bitmap & ~PAGE_MASK) {
		bitmap &= PAGE_MASK;
		bitmap |= PAGE_SIZE;
	}

	return bitmap;
}

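/*
 * Illustrative userspace sketch (assumed usage, not part of this file): the
 * bitmap computed above is exported as iova_pgsizes via VFIO_IOMMU_GET_INFO,
 * letting an application discover the mapping granularity up front
 * ("container_fd" is assumed to be an open /dev/vfio/vfio container using
 * this backend; needs <sys/ioctl.h> and <linux/vfio.h>):
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *
 *	if (!ioctl(container_fd, VFIO_IOMMU_GET_INFO, &info))
 *		printf("IOVA page sizes: 0x%llx\n",
 *		       (unsigned long long)info.iova_pgsizes);
 */
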
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap)
{
	uint64_t mask;
	struct vfio_dma *dma;
	size_t unmapped = 0;
	int ret = 0;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	if (unmap->iova & mask)
		return -EINVAL;
	if (!unmap->size || unmap->size & mask)
		return -EINVAL;

	WARN_ON(mask & PAGE_MASK);

	mutex_lock(&iommu->lock);

	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings. This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range. Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked. We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings. This
	 * resulted in a couple unusual behaviors. First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap. Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU. Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap. And an unmap request covering
	 * the first iova of mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings. Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range. An error
	 * will be returned if these conditions are not met. The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
	if (iommu->v2) {
		dma = vfio_find_dma(iommu, unmap->iova, 0);
		if (dma && dma->iova != unmap->iova) {
			ret = -EINVAL;
			goto unlock;
		}
		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
			ret = -EINVAL;
			goto unlock;
		}
	}

	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
		if (!iommu->v2 && unmap->iova > dma->iova)
			break;
		unmapped += dma->size;
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}

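/*
 * Illustrative userspace sketch (assumed usage, not part of this file): under
 * the v2 semantics described above, an unmap must cover whole prior mappings,
 * so the usual pattern is to replay the exact iova/size pair used at map time
 * (the values and "container_fd" below are example assumptions):
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000000ULL,
 *		.size  = 2 * 1024 * 1024,
 *	};
 *
 *	if (!ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap))
 *		assert(unmap.size == 2 * 1024 * 1024);	// bytes actually unmapped
 */
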
/*
 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 * to a region that previously mapped smaller pages. This should be fixed
 * soon, so this is just a temporary workaround to break mappings down into
 * PAGE_SIZE. Better to map smaller pages than nothing.
 */
static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	long i;
	int ret = 0;

	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
		ret = iommu_map(domain->domain, iova,
				(phys_addr_t)pfn << PAGE_SHIFT,
				PAGE_SIZE, prot | domain->prot);
		if (ret)
			break;
	}

	if (ret) {
		/* Unwind only the pages that were successfully mapped */
		while (i--) {
			iova -= PAGE_SIZE;
			iommu_unmap(domain->domain, iova, PAGE_SIZE);
		}
	}

	return ret;
}

static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret) {
			if (ret != -EBUSY ||
			    map_try_harder(d, iova, pfn, npage, prot))
				goto unwind;
		}

		cond_resched();
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);

	return ret;
}

static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	long npage;
	int ret = 0, prot = 0;
	uint64_t mask;
	struct vfio_dma *dma;
	unsigned long pfn;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	WARN_ON(mask & PAGE_MASK);

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if (!prot || !size || (size | iova | vaddr) & mask)
		return -EINVAL;

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
		return -EINVAL;

	mutex_lock(&iommu->lock);

	if (vfio_find_dma(iommu, iova, size)) {
		mutex_unlock(&iommu->lock);
		return -EEXIST;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		mutex_unlock(&iommu->lock);
		return -ENOMEM;
	}

	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/* Insert zero-sized and grow as we map chunks of it */
	vfio_link_dma(iommu, dma);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages(vaddr + dma->size,
				       size >> PAGE_SHIFT, prot, &pfn);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
		if (ret) {
			vfio_unpin_pages(pfn, npage, prot, true);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	if (ret)
		vfio_remove_dma(iommu, dma);

	mutex_unlock(&iommu->lock);
	return ret;
}

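/*
 * Illustrative userspace sketch (assumed usage, not part of this file): a
 * page-aligned buffer, sized in multiples of the reported page size, is
 * pinned and mapped through VFIO_IOMMU_MAP_DMA. "container_fd" is assumed to
 * be an open /dev/vfio/vfio container with a group attached and this Type1
 * backend selected:
 *
 *	size_t sz = 2 * 1024 * 1024;
 *	void *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000000ULL,	// example IOVA chosen by userspace
 *		.size  = sz,
 *	};
 *
 *	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
 *		perror("VFIO_IOMMU_MAP_DMA");
 */
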
static int vfio_bus_type(struct device *dev, void *data)
{
	struct bus_type **bus = data;

	if (*bus && *bus != dev->bus)
		return -EINVAL;

	*bus = dev->bus;

	return 0;
}

static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_domain *d;
	struct rb_node *n;
	int ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
	n = rb_first(&iommu->dma_list);

	/* If there's not a domain, there better not be any mappings */
	if (WARN_ON(n && !d))
		return -EINVAL;

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
			size_t size;

			if (WARN_ON(!phys)) {
				iova += PAGE_SIZE;
				continue;
			}

			size = PAGE_SIZE;

			while (iova + size < dma->iova + dma->size &&
			       phys + size == iommu_iova_to_phys(d->domain,
								 iova + size))
				size += PAGE_SIZE;

			ret = iommu_map(domain->domain, iova, phys,
					size, dma->prot | domain->prot);
			if (ret)
				return ret;

			iova += size;
		}
	}

	return 0;
}

/*
 * We change our unmap behavior slightly depending on whether the IOMMU
 * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
 * for practically any contiguous power-of-two mapping we give it. This means
 * we don't need to look for contiguous chunks ourselves to make unmapping
 * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
 * hugetlbfs is in use.
 */
static void vfio_test_domain_fgsp(struct vfio_domain *domain)
{
	struct page *pages;
	int ret, order = get_order(PAGE_SIZE * 2);

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
	if (!pages)
		return;

	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
			IOMMU_READ | IOMMU_WRITE | domain->prot);
	if (!ret) {
		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);

		if (unmapped == PAGE_SIZE)
			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
		else
			domain->fgsp = true;
	}

	__free_pages(pages, order);
}

static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group, *g;
	struct vfio_domain *domain, *d;
	struct bus_type *bus = NULL;
	int ret;

	mutex_lock(&iommu->lock);

	list_for_each_entry(d, &iommu->domain_list, next) {
		list_for_each_entry(g, &d->group_list, next) {
			if (g->iommu_group != iommu_group)
				continue;

			mutex_unlock(&iommu->lock);
			return -EINVAL;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!group || !domain) {
		ret = -ENOMEM;
		goto out_free;
	}

	group->iommu_group = iommu_group;

	/* Determine bus_type in order to allocate a domain */
	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
	if (ret)
		goto out_free;

	domain->domain = iommu_domain_alloc(bus);
	if (!domain->domain) {
		ret = -EIO;
		goto out_free;
	}

	if (iommu->nesting) {
		int attr = 1;

		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
					    &attr);
		if (ret)
			goto out_domain;
	}

	ret = iommu_attach_group(domain->domain, iommu_group);
	if (ret)
		goto out_domain;

	INIT_LIST_HEAD(&domain->group_list);
	list_add(&group->next, &domain->group_list);

	if (!allow_unsafe_interrupts &&
	    !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) {
		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
		       __func__);
		ret = -EPERM;
		goto out_detach;
	}

	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
		domain->prot |= IOMMU_CACHE;

	/*
	 * Try to match an existing compatible domain. We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->domain->ops == domain->domain->ops &&
		    d->prot == domain->prot) {
			iommu_detach_group(domain->domain, iommu_group);
			if (!iommu_attach_group(d->domain, iommu_group)) {
				list_add(&group->next, &d->group_list);
				iommu_domain_free(domain->domain);
				kfree(domain);
				mutex_unlock(&iommu->lock);
				return 0;
			}

			ret = iommu_attach_group(domain->domain, iommu_group);
			if (ret)
				goto out_domain;
		}
	}

	vfio_test_domain_fgsp(domain);

	/* replay mappings on new domains */
	ret = vfio_iommu_replay(iommu, domain);
	if (ret)
		goto out_detach;

	list_add(&domain->next, &iommu->domain_list);

	mutex_unlock(&iommu->lock);

	return 0;

out_detach:
	iommu_detach_group(domain->domain, iommu_group);
out_domain:
	iommu_domain_free(domain->domain);
out_free:
	kfree(domain);
	kfree(group);
	mutex_unlock(&iommu->lock);
	return ret;
}

static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
{
	struct rb_node *node;

	while ((node = rb_first(&iommu->dma_list)))
		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}

static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain;
	struct vfio_group *group;

	mutex_lock(&iommu->lock);

	list_for_each_entry(domain, &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
			if (group->iommu_group != iommu_group)
				continue;

			iommu_detach_group(domain->domain, iommu_group);
			list_del(&group->next);
			kfree(group);
			/*
			 * Group ownership provides privilege, if the group
			 * list is empty, the domain goes away. If it's the
			 * last domain, then all the mappings go away too.
			 */
			if (list_empty(&domain->group_list)) {
				if (list_is_singular(&iommu->domain_list))
					vfio_iommu_unmap_unpin_all(iommu);
				iommu_domain_free(domain->domain);
				list_del(&domain->next);
				kfree(domain);
			}
			goto done;
		}
	}

done:
	mutex_unlock(&iommu->lock);
}

static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		iommu->nesting = true;
		/* fall through - nesting implies v2 */
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	iommu->dma_list = RB_ROOT;
	mutex_init(&iommu->lock);

	return iommu;
}

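/*
 * Illustrative userspace sketch (assumed usage, not part of this file): the
 * "arg" handled above is the type passed to VFIO_SET_IOMMU, which the vfio
 * core only accepts once at least one group is attached to the container:
 *
 *	int container_fd = open("/dev/vfio/vfio", O_RDWR);
 *
 *	// ... VFIO_GROUP_SET_CONTAINER on a group fd first ...
 *	if (ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU) > 0)
 *		ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 */
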
static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain, *domain_tmp;
	struct vfio_group *group, *group_tmp;

	vfio_iommu_unmap_unpin_all(iommu);

	list_for_each_entry_safe(domain, domain_tmp,
				 &iommu->domain_list, next) {
		list_for_each_entry_safe(group, group_tmp,
					 &domain->group_list, next) {
			iommu_detach_group(domain->domain, group->iommu_group);
			list_del(&group->next);
			kfree(group);
		}
		iommu_domain_free(domain->domain);
		list_del(&domain->next);
		kfree(domain);
	}

	kfree(iommu);
}

static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	int ret = 1;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (!(domain->prot & IOMMU_CACHE)) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return ret;
}

static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION) {
		switch (arg) {
		case VFIO_TYPE1_IOMMU:
		case VFIO_TYPE1v2_IOMMU:
		case VFIO_TYPE1_NESTING_IOMMU:
			return 1;
		case VFIO_DMA_CC_IOMMU:
			if (!iommu)
				return 0;
			return vfio_domains_have_iommu_cache(iommu);
		default:
			return 0;
		}
	} else if (cmd == VFIO_IOMMU_GET_INFO) {
		struct vfio_iommu_type1_info info;

		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = 0;

		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);

	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
		struct vfio_iommu_type1_dma_unmap unmap;
		long ret;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

		if (copy_from_user(&unmap, (void __user *)arg, minsz))
			return -EFAULT;

		if (unmap.argsz < minsz || unmap.flags)
			return -EINVAL;

		ret = vfio_dma_do_unmap(iommu, &unmap);
		if (ret)
			return ret;

		return copy_to_user((void __user *)arg, &unmap, minsz) ?
			-EFAULT : 0;
	}

	return -ENOTTY;
}

static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name		= "vfio-iommu-type1",
	.owner		= THIS_MODULE,
	.open		= vfio_iommu_type1_open,
	.release	= vfio_iommu_type1_release,
	.ioctl		= vfio_iommu_type1_ioctl,
	.attach_group	= vfio_iommu_type1_attach_group,
	.detach_group	= vfio_iommu_type1_detach_group,
};

static int __init vfio_iommu_type1_init(void)
{
	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);