pgtable_64.c

/*
 * This file contains ioremap and related functions for 64-bit machines.
 *
 * Derived from arch/ppc64/mm/init.c
 *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *   Copyright (C) 1996 Paul Mackerras
 *
 * Derived from "arch/i386/mm/init.c"
 *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Dave Engebretsen <engebret@us.ibm.com>
 *   Rework for PPC64 port.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/dma.h>

#include "mmu_decl.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/* Some sanity checking */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif

unsigned long ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PPC_MMU_NOHASH
static __ref void *early_alloc_pgtable(unsigned long size)
{
        void *pt;

        pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
        memset(pt, 0, size);

        return pt;
}
#endif /* CONFIG_PPC_MMU_NOHASH */

/*
 * map_kernel_page is currently only called by __ioremap.
 * map_kernel_page adds an entry to the ioremap page table
 * and adds an entry to the HPT, possibly bolting it.
 */
int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
{
        pgd_t *pgdp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;

        if (slab_is_available()) {
                pgdp = pgd_offset_k(ea);
                pudp = pud_alloc(&init_mm, pgdp, ea);
                if (!pudp)
                        return -ENOMEM;
                pmdp = pmd_alloc(&init_mm, pudp, ea);
                if (!pmdp)
                        return -ENOMEM;
                ptep = pte_alloc_kernel(pmdp, ea);
                if (!ptep)
                        return -ENOMEM;
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
                                                       __pgprot(flags)));
        } else {
#ifdef CONFIG_PPC_MMU_NOHASH
                pgdp = pgd_offset_k(ea);
#ifdef PUD_TABLE_SIZE
                if (pgd_none(*pgdp)) {
                        pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
                        BUG_ON(pudp == NULL);
                        pgd_populate(&init_mm, pgdp, pudp);
                }
#endif /* PUD_TABLE_SIZE */
                pudp = pud_offset(pgdp, ea);
                if (pud_none(*pudp)) {
                        pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
                        BUG_ON(pmdp == NULL);
                        pud_populate(&init_mm, pudp, pmdp);
                }
                pmdp = pmd_offset(pudp, ea);
                if (!pmd_present(*pmdp)) {
                        ptep = early_alloc_pgtable(PAGE_SIZE);
                        BUG_ON(ptep == NULL);
                        pmd_populate_kernel(&init_mm, pmdp, ptep);
                }
                ptep = pte_offset_kernel(pmdp, ea);
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
                                                       __pgprot(flags)));
#else /* CONFIG_PPC_MMU_NOHASH */
                /*
                 * If the mm subsystem is not fully up, we cannot create a
                 * linux page table entry for this mapping.  Simply bolt an
                 * entry in the hardware page table.
                 */
                if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
                                      mmu_io_psize, mmu_kernel_ssize)) {
                        printk(KERN_ERR "Failed to do bolted mapping IO "
                               "memory at %016lx !\n", pa);
                        return -ENOMEM;
                }
#endif /* !CONFIG_PPC_MMU_NOHASH */
        }
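
        /*
         * Make sure the page table update above is visible before the
         * new mapping is used (the caller may publish the virtual
         * address to other CPUs).
         */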
        smp_wmb();
        return 0;
}

/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
                            unsigned long flags)
{
        unsigned long i;

        /* Make sure we have the base flags */
        if ((flags & _PAGE_PRESENT) == 0)
                flags |= pgprot_val(PAGE_KERNEL);

        /* Non-cacheable page cannot be coherent */
        if (flags & _PAGE_NO_CACHE)
                flags &= ~_PAGE_COHERENT;

        /* We don't support the 4K PFN hack with ioremap */
        if (flags & _PAGE_4K_PFN)
                return NULL;

        WARN_ON(pa & ~PAGE_MASK);
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        for (i = 0; i < size; i += PAGE_SIZE)
                if (map_kernel_page((unsigned long)ea + i, pa + i, flags))
                        return NULL;

        return (void __iomem *)ea;
}

/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 */
void __iounmap_at(void *ea, unsigned long size)
{
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        unmap_kernel_range((unsigned long)ea, size);
}

void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
                                unsigned long flags, void *caller)
{
        phys_addr_t paligned;
        void __iomem *ret;

        /*
         * Choose an address to map it to.
         * Once the imalloc system is running, we use it.
         * Before that, we map using addresses going
         * up from ioremap_bot.  imalloc will use
         * the addresses from ioremap_bot through
         * IMALLOC_END
         */
        paligned = addr & PAGE_MASK;
        size = PAGE_ALIGN(addr + size) - paligned;

        if ((size == 0) || (paligned == 0))
                return NULL;

        if (slab_is_available()) {
                struct vm_struct *area;

                area = __get_vm_area_caller(size, VM_IOREMAP,
                                            ioremap_bot, IOREMAP_END,
                                            caller);
                if (area == NULL)
                        return NULL;

                area->phys_addr = paligned;
                ret = __ioremap_at(paligned, area->addr, size, flags);
                if (!ret)
                        vunmap(area->addr);
        } else {
                ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
                if (ret)
                        ioremap_bot += size;
        }

        if (ret)
                ret += addr & ~PAGE_MASK;
        return ret;
}

void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
                         unsigned long flags)
{
        return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}

void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = _PAGE_NO_CACHE;
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
                            unsigned long flags)
{
        void *caller = __builtin_return_address(0);

        /* writeable implies dirty for kernel addresses */
        if (flags & _PAGE_RW)
                flags |= _PAGE_DIRTY;

        /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
        flags &= ~(_PAGE_USER | _PAGE_EXEC);

#ifdef _PAGE_BAP_SR
        /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
         * which means that we just cleared supervisor access... oops ;-) This
         * restores it
         */
        flags |= _PAGE_BAP_SR;
#endif

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

/*
 * Unmap an IO region and remove it from imalloc'd list.
 * Access to IO memory should be serialized by driver.
 */
void __iounmap(volatile void __iomem *token)
{
        void *addr;

        if (!slab_is_available())
                return;

        addr = (void *) ((unsigned long __force)
                         PCI_FIX_ADDR(token) & PAGE_MASK);
        if ((unsigned long)addr < ioremap_bot) {
                printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
                       " at 0x%p\n", addr);
                return;
        }
        vunmap(addr);
}

void iounmap(volatile void __iomem *token)
{
        if (ppc_md.iounmap)
                ppc_md.iounmap(token);
        else
                __iounmap(token);
}

EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
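
/*
 * Illustrative use from a driver (res and REG_OFFSET below are
 * placeholders, not part of this file): map a device's MMIO range,
 * access it with the MMIO accessors, and tear it down with iounmap():
 *
 *	void __iomem *regs = ioremap(res->start, resource_size(res));
 *	if (!regs)
 *		return -ENOMEM;
 *	val = readl(regs + REG_OFFSET);
 *	...
 *	iounmap(regs);
 *
 * Note that a platform may override the mapping primitives entirely via
 * ppc_md.ioremap and ppc_md.iounmap, as the wrappers above show.
 */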

#ifndef __PAGETABLE_PUD_FOLDED
/* 4 level page table */
struct page *pgd_page(pgd_t pgd)
{
        if (pgd_huge(pgd))
                return pte_page(pgd_pte(pgd));
        return virt_to_page(pgd_page_vaddr(pgd));
}
#endif

struct page *pud_page(pud_t pud)
{
        if (pud_huge(pud))
                return pte_page(pud_pte(pud));
        return virt_to_page(pud_page_vaddr(pud));
}

/*
 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
 */
struct page *pmd_page(pmd_t pmd)
{
        if (pmd_trans_huge(pmd) || pmd_huge(pmd))
                return pfn_to_page(pmd_pfn(pmd));
        return virt_to_page(pmd_page_vaddr(pmd));
}
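
/*
 * PTE fragment handling (64K pages): a page table only needs
 * PTE_FRAG_SIZE bytes, far less than a 64K page, so each page backing
 * PTE tables is carved into PTE_FRAG_NR fragments.  mm->context.pte_frag
 * points at the next free fragment of the current page, and the page's
 * reference count tracks how many fragments are still in use (see
 * __alloc_for_cache() and page_table_free() below).
 */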
#ifdef CONFIG_PPC_64K_PAGES
static pte_t *get_from_cache(struct mm_struct *mm)
{
        void *pte_frag, *ret;

        spin_lock(&mm->page_table_lock);
        ret = mm->context.pte_frag;
        if (ret) {
                pte_frag = ret + PTE_FRAG_SIZE;
                /*
                 * If we have taken up all the fragments mark PTE page NULL
                 */
                if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
                        pte_frag = NULL;
                mm->context.pte_frag = pte_frag;
        }
        spin_unlock(&mm->page_table_lock);
        return (pte_t *)ret;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
        void *ret = NULL;
        struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
                                       __GFP_REPEAT | __GFP_ZERO);
        if (!page)
                return NULL;
        if (!kernel && !pgtable_page_ctor(page)) {
                __free_page(page);
                return NULL;
        }

        ret = page_address(page);
        spin_lock(&mm->page_table_lock);
        /*
         * If we find pgtable_page set, we return
         * the allocated page with single fragment
         * count.
         */
        if (likely(!mm->context.pte_frag)) {
                atomic_set(&page->_count, PTE_FRAG_NR);
                mm->context.pte_frag = ret + PTE_FRAG_SIZE;
        }
        spin_unlock(&mm->page_table_lock);

        return (pte_t *)ret;
}

pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
        pte_t *pte;

        pte = get_from_cache(mm);
        if (pte)
                return pte;

        return __alloc_for_cache(mm, kernel);
}

void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
        struct page *page = virt_to_page(table);

        if (put_page_testzero(page)) {
                if (!kernel)
                        pgtable_page_dtor(page);
                free_hot_cold_page(page, 0);
        }
}
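
/*
 * When page-table pages are freed under a TLB gather, the table's index
 * size is stuffed into the low bits of the pointer handed to
 * tlb_remove_table() (table addresses are aligned well beyond
 * MAX_PGTABLE_INDEX_SIZE, so those bits are free).  __tlb_remove_table()
 * decodes it again: shift 0 means a PTE fragment page, anything else is
 * a kmem_cache-backed table freed back to PGT_CACHE(shift).
 */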
#ifdef CONFIG_SMP
static void page_table_free_rcu(void *table)
{
        struct page *page = virt_to_page(table);

        if (put_page_testzero(page)) {
                pgtable_page_dtor(page);
                free_hot_cold_page(page, 0);
        }
}

void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        unsigned long pgf = (unsigned long)table;

        BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
        pgf |= shift;
        tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
        void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
        unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

        if (!shift)
                /* PTE page needs special handling */
                page_table_free_rcu(table);
        else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#else
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        if (!shift) {
                /* PTE page needs special handling */
                struct page *page = virt_to_page(table);

                if (put_page_testzero(page)) {
                        pgtable_page_dtor(page);
                        free_hot_cold_page(page, 0);
                }
        } else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#endif
#endif /* CONFIG_PPC_64K_PAGES */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * This is called when relaxing access to a hugepage. It's also called in
 * the page fault path when we don't hit any of the major fault cases,
 * i.e., a minor update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic
 * code will have handled those two for us; we additionally deal with
 * missing execute permission here on some processors.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp, pmd_t entry, int dirty)
{
        int changed;
#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
        changed = !pmd_same(*(pmdp), entry);
        if (changed) {
                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
                /*
                 * Since we are not supporting SW TLB systems, we don't
                 * have anything similar to flush_tlb_page_nohash()
                 */
        }
        return changed;
}
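
/*
 * pmd_hugepage_update() atomically clears the bits in 'clr' and sets the
 * bits in 'set' on a huge PMD and returns the old value.  With
 * PTE_ATOMIC_UPDATES, the ldarx/stdcx. loop below also spins while
 * _PAGE_BUSY is set, and any hash PTEs backing the old entry are flushed
 * once the update is done.
 */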
unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                  pmd_t *pmdp, unsigned long clr,
                                  unsigned long set)
{
        unsigned long old, tmp;

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES
        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                andi.   %1,%0,%6\n\
                bne-    1b \n\
                andc    %1,%0,%4 \n\
                or      %1,%1,%7\n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
        : "cc" );
#else
        old = pmd_val(*pmdp);
        *pmdp = __pmd((old & ~clr) | set);
#endif
        trace_hugepage_update(addr, old, clr, set);
        if (old & _PAGE_HASHPTE)
                hpte_do_hugepage_flush(mm, addr, pmdp, old);
        return old;
}

pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));

        pmd = *pmdp;
        pmd_clear(pmdp);
        /*
         * Wait for all pending hash_page to finish. This is needed
         * in case of subpage collapse. When we collapse normal pages
         * to hugepage, we first clear the pmd, then invalidate all
         * the PTE entries. The assumption here is that any low level
         * page fault will see a none pmd and take the slow path that
         * will wait on mmap_sem. But we could very well be in a
         * hash_page with local ptep pointer value. Such a hash page
         * can result in adding new HPTE entries for normal subpages.
         * That means we could be modifying the page content as we
         * copy them to a huge page. So wait for parallel hash_page
         * to finish before invalidating HPTE entries. We can do this
         * by sending an IPI to all the cpus and executing a dummy
         * function there.
         */
        kick_all_cpus_sync();
        /*
         * Now invalidate the hpte entries in the range
         * covered by pmd. This makes sure we take a
         * fault and will find the pmd as none, which will
         * result in a major fault which takes mmap_sem and
         * hence waits for collapse to complete. Without this
         * the __collapse_huge_page_copy can result in copying
         * the old content.
         */
        flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
        return pmd;
}

int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We currently remove entries from the hashtable regardless of whether
 * the entry was young or dirty. The generic routines only flush if the
 * entry was young or dirty, which is not good enough.
 *
 * We should be more intelligent about this but for the moment we override
 * these functions and force a tlb flush unconditionally.
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We mark the pmd splitting and invalidate all the hpte
 * entries for this hugepage.
 */
void pmdp_splitting_flush(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp)
{
        unsigned long old, tmp;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES
        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                andi.   %1,%0,%6\n\
                bne-    1b \n\
                ori     %1,%0,%4 \n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
        : "cc" );
#else
        old = pmd_val(*pmdp);
        *pmdp = __pmd(old | _PAGE_SPLITTING);
#endif
        /*
         * If we didn't have the splitting flag set, go and flush the
         * HPTE entries.
         */
        trace_hugepage_splitting(address, old);
        if (!(old & _PAGE_SPLITTING)) {
                /* We need to flush the hpte */
                if (old & _PAGE_HASHPTE)
                        hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
        }
        /*
         * This ensures that generic code that relies on IRQ disabling
         * to prevent a parallel THP split works as expected.
         */
        kick_all_cpus_sync();
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        pgtable_t *pgtable_slot;

        assert_spin_locked(&mm->page_table_lock);
        /*
         * we store the pgtable in the second half of PMD
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        *pgtable_slot = pgtable;
        /*
         * expose the deposited pgtable to other cpus.
         * before we set the hugepage PTE at pmd level
         * hash fault code looks at the deposited pgtable
         * to store hash index values.
         */
        smp_wmb();
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;
        pgtable_t *pgtable_slot;

        assert_spin_locked(&mm->page_table_lock);
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Once we withdraw, mark the entry NULL.
         */
        *pgtable_slot = NULL;
        /*
         * We store HPTE information in the deposited PTE fragment.
         * zero out the content on withdraw.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return pgtable;
}

/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
        WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) ==
                (_PAGE_PRESENT | _PAGE_USER));
        assert_spin_locked(&mm->page_table_lock);
        WARN_ON(!pmd_trans_huge(pmd));
#endif
        trace_hugepage_set_pmd(addr, pmd_val(pmd));
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
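
/*
 * Invalidate a huge PMD by clearing _PAGE_PRESENT; pmd_hugepage_update()
 * also takes care of flushing any hash PTEs that were backing the entry.
 */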
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                            pmd_t *pmdp, unsigned long old_pmd)
{
        int ssize;
        unsigned int psize;
        unsigned long vsid;
        unsigned long flags = 0;
        const struct cpumask *tmp;

        /* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
        psize = get_slice_psize(mm, addr);
        BUG_ON(psize == MMU_PAGE_16M);
#endif
        if (old_pmd & _PAGE_COMBO)
                psize = MMU_PAGE_4K;
        else
                psize = MMU_PAGE_64K;

        if (!is_kernel_addr(addr)) {
                ssize = user_segment_size(addr);
                vsid = get_vsid(mm->context.id, addr, ssize);
                WARN_ON(vsid == 0);
        } else {
                vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
                ssize = mmu_kernel_ssize;
        }

        tmp = cpumask_of(smp_processor_id());
        if (cpumask_equal(mm_cpumask(mm), tmp))
                flags |= HPTE_LOCAL_UPDATE;

        return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
        pmd_val(pmd) |= pgprot_val(pgprot);
        return pmd;
}

pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
        pmd_t pmd;
        /*
         * For a valid pte, we would have _PAGE_PRESENT always
         * set. We use this to check THP page at pmd level.
         * leaf pte for huge page, bottom two bits != 00
         */
        pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
        pmd_val(pmd) |= _PAGE_THP_HUGE;
        pmd = pmd_set_protbits(pmd, pgprot);
        return pmd;
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
        return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        pmd_val(pmd) &= _HPAGE_CHG_MASK;
        pmd = pmd_set_protbits(pmd, newprot);
        return pmd;
}

/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 * We use it to preload an HPTE into the hash table corresponding to
 * the updated linux HUGE PMD entry.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                          pmd_t *pmd)
{
        return;
}

pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                              unsigned long addr, pmd_t *pmdp)
{
        pmd_t old_pmd;
        pgtable_t pgtable;
        unsigned long old;
        pgtable_t *pgtable_slot;

        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
        old_pmd = __pmd(old);
        /*
         * We have pmd == none and we are holding page_table_lock.
         * So we can safely go and clear the pgtable hash
         * index info.
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Let's zero out old valid and hash index details;
         * the hash fault path looks at them.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        /*
         * Serialize against find_linux_pte_or_hugepte which does lock-less
         * lookup in page tables with local interrupts disabled. For huge pages
         * it casts pmd_t to pte_t. Since format of pte_t is different from
         * pmd_t we want to prevent transit from pmd pointing to page table
         * to pmd pointing to huge page (and back) while interrupts are disabled.
         * We clear pmd to possibly replace it with page table pointer in
         * different code paths. So make sure we wait for the parallel
         * find_linux_pte_or_hugepte to finish.
         */
        kick_all_cpus_sync();
        return old_pmd;
}

int has_transparent_hugepage(void)
{
        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return 0;
        /*
         * We support THP only if PMD_SIZE is 16MB.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
                return 0;
        /*
         * We need to make sure that we support 16MB hugepage in a segment
         * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
         * of 64K.
         */
        /*
         * If we have 64K HPTE, we will be using that by default
         */
        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
                return 0;
        /*
         * Ok we only have 4K HPTE
         */
        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
                return 0;

        return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */