kaiser.c

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
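
/*
 * Illustrative sketch (not verbatim from the entry code): the exit-to-user
 * path picks up this per-cpu value and folds it into CR3 roughly like
 *
 *	movq	%cr3, %reg
 *	orq	PER_CPU_VAR(x86_cr3_pcid_user), %reg
 *	movq	%reg, %cr3
 *
 * which is why the PCID/NOFLUSH bits live in a memory location instead of
 * being encoded into an immediate in the instruction stream.
 */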

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
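
/*
 * The pattern the lock protects (see kaiser_pagetable_walk() below) is the
 * usual check, allocate outside the lock, then re-check under the lock:
 *
 *	if (pud_none(*pud)) {
 *		page = __get_free_page(gfp);
 *		spin_lock(&shadow_table_allocation_lock);
 *		if (pud_none(*pud))
 *			set_pud(pud, ...);
 *		else
 *			free_page(page);
 *		spin_unlock(&shadow_table_allocation_lock);
 *	}
 */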

/*
 * Walk the kernel page tables and return the physical address that
 * backs @vaddr.  Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	unsigned long prot = _KERNPG_TABLE;

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	if (user) {
		/*
		 * The vsyscall page is the only page that will have
		 * _PAGE_USER set.  Catch everything else.
		 */
		BUG_ON(address != VSYSCALL_ADDR);
		set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
		prot = _PAGE_TABLE;
	}

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(prot | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}
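
/*
 * Walk [__start_addr, __start_addr + size) one page at a time, look up
 * each page's physical address in the kernel page tables, and install a
 * matching PTE with @flags in the shadow (user) page tables.  An already
 * present shadow PTE is left alone, but we warn if it differs from what
 * we would have installed.
 */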
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}
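
/*
 * Convenience wrapper for ranges given as two pointers rather than a
 * start and size, e.g. the linker-section bounds (__entry_text_start,
 * __entry_text_end) used in kaiser_init() below.
 */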
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables are
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}
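
/*
 * Boot-time wrappers for the two mapping helpers above: this early there
 * is little we can do about a failure, so the return value is only fed
 * to WARN_ON().
 */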
#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)
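
/*
 * Parse the early command line: "pti=on"/"pti=off"/"pti=auto" and the
 * legacy "nopti" control the feature explicitly.  With no explicit
 * override, KAISER is silently left off on Xen PV guests, disabled (with
 * a message) on AMD CPUs, and forced on everywhere else.
 */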
void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of crashing before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	/*
	 * Note that this sets _PAGE_USER and it needs to happen when the
	 * pagetable hierarchy gets created, i.e., early.  Otherwise
	 * kaiser_pagetable_walk() will encounter initialized PTEs in the
	 * hierarchy and not set the proper permissions, leading to page
	 * faults with page-protection violations when, for example, trying
	 * to read the vsyscall page.
	 */
	if (vsyscall_enabled())
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
					  PAGE_SIZE,
					  vsyscall_pgprot);

	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to and from kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}
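
/*
 * Illustrative caller (hypothetical, not part of this file): a per-process
 * LDT that the entry code must still be able to reach with the user CR3
 * loaded would be mapped with something like
 *
 *	ret = kaiser_add_mapping((unsigned long)ldt->entries,
 *				 alloc_size, __PAGE_KERNEL);
 *
 * and the matching teardown would call kaiser_remove_mapping() on the
 * same range.
 */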
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}
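
/*
 * Illustrative arithmetic: with 4K pages and 8-byte pgd entries, entry
 * 255 sits at offset 255 * 8 = 0x7f8 within the PGD page, which is below
 * PAGE_SIZE/2 = 0x800, so it is treated as a userspace entry; entry 256
 * sits at offset 0x800 and is treated as a kernel entry.
 */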
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		     is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}
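
/*
 * Roughly (see the KAISER changes to asm/pgtable_64.h), native_set_pgd()
 * funnels every kernel pgd write through the helper above:
 *
 *	static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 *	{
 *		*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 *	}
 *
 * so the shadow copy is kept in sync as a side effect of ordinary
 * page-table updates.
 */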
void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * This variable is used by the entry/exit code to switch
	 * PCID and pgd, and to decide on TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}
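
/*
 * Note on the NOFLUSH variants: bit 63 of CR3 (available with PCIDs)
 * tells the CPU not to flush the TLB entries tagged with the new PCID
 * when CR3 is written.  That is the difference between
 * X86_CR3_PCID_USER_NOFLUSH above and X86_CR3_PCID_USER_FLUSH below.
 */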

/*
 * Make a note that this cpu will need to flush USER tlb on return to user.
 * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);