entry_64.S

  1. /*
  2. * linux/arch/x86_64/entry.S
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
  6. * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
  7. *
  8. * entry.S contains the system-call and fault low-level handling routines.
  9. *
  10. * Some of this is documented in Documentation/x86/entry_64.txt
  11. *
  12. * A note on terminology:
  13. * - iret frame: Architecture defined interrupt frame from SS to RIP
  14. * at the top of the kernel process stack.
  15. *
  16. * Some macro usage:
  17. * - ENTRY/END: Define functions in the symbol table.
  18. * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
  19. * - idtentry: Define exception entry points.
  20. */
  21. #include <linux/linkage.h>
  22. #include <asm/segment.h>
  23. #include <asm/cache.h>
  24. #include <asm/errno.h>
  25. #include "calling.h"
  26. #include <asm/asm-offsets.h>
  27. #include <asm/msr.h>
  28. #include <asm/unistd.h>
  29. #include <asm/thread_info.h>
  30. #include <asm/hw_irq.h>
  31. #include <asm/page_types.h>
  32. #include <asm/irqflags.h>
  33. #include <asm/paravirt.h>
  34. #include <asm/percpu.h>
  35. #include <asm/asm.h>
  36. #include <asm/smap.h>
  37. #include <asm/pgtable_types.h>
  38. #include <asm/kaiser.h>
  39. #include <asm/nospec-branch.h>
  40. #include <linux/err.h>
  41. /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
  42. #include <linux/elf-em.h>
  43. #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
  44. #define __AUDIT_ARCH_64BIT 0x80000000
  45. #define __AUDIT_ARCH_LE 0x40000000
  46. .code64
  47. .section .entry.text, "ax"
  48. #ifdef CONFIG_PARAVIRT
  49. ENTRY(native_usergs_sysret64)
  50. swapgs
  51. sysretq
  52. ENDPROC(native_usergs_sysret64)
  53. #endif /* CONFIG_PARAVIRT */
  54. .macro TRACE_IRQS_IRETQ
  55. #ifdef CONFIG_TRACE_IRQFLAGS
  56. bt $9, EFLAGS(%rsp) /* interrupts off? */
  57. jnc 1f
  58. TRACE_IRQS_ON
  59. 1:
  60. #endif
  61. .endm
  62. /*
  63. * When dynamic function tracer is enabled it will add a breakpoint
  64. * to all locations that it is about to modify, sync CPUs, update
  65. * all the code, sync CPUs, then remove the breakpoints. During this time,
  66. * if lockdep is enabled, it might jump back into the debug handler
  67. * outside of the IST protection update window (TRACE_IRQS_ON/OFF).
  68. *
  69. * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
  70. * make sure the stack pointer does not get reset back to the top
  71. * of the debug stack, and instead just reuses the current stack.
  72. */
  73. #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
  74. .macro TRACE_IRQS_OFF_DEBUG
  75. call debug_stack_set_zero
  76. TRACE_IRQS_OFF
  77. call debug_stack_reset
  78. .endm
  79. .macro TRACE_IRQS_ON_DEBUG
  80. call debug_stack_set_zero
  81. TRACE_IRQS_ON
  82. call debug_stack_reset
  83. .endm
  84. .macro TRACE_IRQS_IRETQ_DEBUG
  85. btl $9, EFLAGS(%rsp) /* interrupts off? */
  86. jnc 1f
  87. TRACE_IRQS_ON_DEBUG
  88. 1:
  89. .endm
  90. #else
  91. # define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
  92. # define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
  93. # define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
  94. #endif
  95. /*
  96. * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
  97. *
  98. * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
  99. * then loads new ss, cs, and rip from previously programmed MSRs.
  100. * rflags gets masked by a value from another MSR (so CLD and CLAC
  101. * are not needed). SYSCALL does not save anything on the stack
  102. * and does not change rsp.
  103. *
  104. * Registers on entry:
  105. * rax system call number
  106. * rcx return address
  107. * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
  108. * rdi arg0
  109. * rsi arg1
  110. * rdx arg2
  111. * r10 arg3 (needs to be moved to rcx to conform to C ABI)
  112. * r8 arg4
  113. * r9 arg5
  114. * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
  115. *
  116. * Only called from user space.
  117. *
  118. * When the user can change pt_regs->foo, always force IRET. That is because
  119. * it deals with non-canonical addresses better. SYSRET has trouble
  120. * with them due to bugs in both AMD and Intel CPUs.
  121. */
  122. ENTRY(entry_SYSCALL_64)
  123. /*
  124. * Interrupts are off on entry.
  125. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
  126. * it is too small to ever cause noticeable irq latency.
  127. */
  128. SWAPGS_UNSAFE_STACK
  129. SWITCH_KERNEL_CR3_NO_STACK
  130. /*
  131. * A hypervisor implementation might want to use a label
  132. * after the swapgs, so that it can do the swapgs
  133. * for the guest and jump here on syscall.
  134. */
  135. GLOBAL(entry_SYSCALL_64_after_swapgs)
  136. movq %rsp, PER_CPU_VAR(rsp_scratch)
  137. movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  138. /* Construct struct pt_regs on stack */
  139. pushq $__USER_DS /* pt_regs->ss */
  140. pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
  141. /*
  142. * Re-enable interrupts.
  143. * We use 'rsp_scratch' as a scratch space, hence irq-off block above
  144. * must execute atomically in the face of possible interrupt-driven
  145. * task preemption. We must enable interrupts only after we're done
  146. * with using rsp_scratch:
  147. */
  148. ENABLE_INTERRUPTS(CLBR_NONE)
  149. pushq %r11 /* pt_regs->flags */
  150. pushq $__USER_CS /* pt_regs->cs */
  151. pushq %rcx /* pt_regs->ip */
  152. pushq %rax /* pt_regs->orig_ax */
  153. pushq %rdi /* pt_regs->di */
  154. pushq %rsi /* pt_regs->si */
  155. pushq %rdx /* pt_regs->dx */
  156. pushq %rcx /* pt_regs->cx */
  157. pushq $-ENOSYS /* pt_regs->ax */
  158. pushq %r8 /* pt_regs->r8 */
  159. pushq %r9 /* pt_regs->r9 */
  160. pushq %r10 /* pt_regs->r10 */
  161. pushq %r11 /* pt_regs->r11 */
  162. sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
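/*
 * The fast path leaves the rbp, rbx and r12-r15 slots allocated but
 * unsaved; paths that need a full pt_regs (tracesys_phase2,
 * int_ret_from_sys_call) fill them in with SAVE_EXTRA_REGS.
 */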
  163. testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
  164. jnz tracesys
  165. entry_SYSCALL_64_fastpath:
  166. #if __SYSCALL_MASK == ~0
  167. cmpq $NR_syscalls, %rax
  168. #else
  169. andl $__SYSCALL_MASK, %eax
  170. cmpl $NR_syscalls, %eax
  171. #endif
  172. jae 1f /* return -ENOSYS (already in pt_regs->ax) */
  173. sbb %rcx, %rcx /* array_index_mask_nospec() */
  174. and %rcx, %rax
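/*
 * The cmpq above leaves CF set only when the syscall number is in bounds,
 * so the sbb produces an all-ones mask for valid numbers and zero
 * otherwise; the and then clamps a mispredicted out-of-bounds %rax to 0
 * before it indexes sys_call_table (an open-coded
 * array_index_mask_nospec()).
 */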
  175. movq %r10, %rcx
  176. #ifdef CONFIG_RETPOLINE
  177. movq sys_call_table(, %rax, 8), %rax
  178. call __x86_indirect_thunk_rax
  179. #else
  180. call *sys_call_table(, %rax, 8)
  181. #endif
  182. movq %rax, RAX(%rsp)
  183. 1:
  184. /*
  185. * Syscall return path ending with SYSRET (fast path).
  186. * Has incompletely filled pt_regs.
  187. */
  188. LOCKDEP_SYS_EXIT
  189. /*
  190. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
  191. * it is too small to ever cause noticeable irq latency.
  192. */
  193. DISABLE_INTERRUPTS(CLBR_NONE)
  194. /*
  195. * We must check ti flags with interrupts (or at least preemption)
  196. * off because we must *never* return to userspace without
  197. * processing exit work that is enqueued if we're preempted here.
  198. * In particular, returning to userspace with any of the one-shot
  199. * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
  200. * very bad.
  201. */
  202. testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
  203. jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
  204. movq RIP(%rsp), %rcx
  205. movq EFLAGS(%rsp), %r11
  206. RESTORE_C_REGS_EXCEPT_RCX_R11
  207. /*
  208. * This opens a window where we have a user CR3, but are
  209. * running in the kernel. This makes using the CS
  210. * register useless for telling whether or not we need to
  211. * switch CR3 in NMIs. Normal interrupts are OK because
  212. * they are off here.
  213. */
  214. SWITCH_USER_CR3
  215. movq RSP(%rsp), %rsp
  216. /*
  217. * 64-bit SYSRET restores rip from rcx,
  218. * rflags from r11 (but RF and VM bits are forced to 0),
  219. * cs and ss are loaded from MSRs.
  220. * Restoration of rflags re-enables interrupts.
  221. *
  222. * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
  223. * descriptor is not reinitialized. This means that we should
  224. * avoid SYSRET with SS == NULL, which could happen if we schedule,
  225. * exit the kernel, and re-enter using an interrupt vector. (All
  226. * interrupt entries on x86_64 set SS to NULL.) We prevent that
  227. * from happening by reloading SS in __switch_to. (Actually
  228. * detecting the failure in 64-bit userspace is tricky but can be
  229. * done.)
  230. */
  231. USERGS_SYSRET64
  232. GLOBAL(int_ret_from_sys_call_irqs_off)
  233. TRACE_IRQS_ON
  234. ENABLE_INTERRUPTS(CLBR_NONE)
  235. jmp int_ret_from_sys_call
  236. /* Do syscall entry tracing */
  237. tracesys:
  238. movq %rsp, %rdi
  239. movl $AUDIT_ARCH_X86_64, %esi
  240. call syscall_trace_enter_phase1
  241. test %rax, %rax
  242. jnz tracesys_phase2 /* if needed, run the slow path */
  243. RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
  244. movq ORIG_RAX(%rsp), %rax
  245. jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
  246. tracesys_phase2:
  247. SAVE_EXTRA_REGS
  248. movq %rsp, %rdi
  249. movl $AUDIT_ARCH_X86_64, %esi
  250. movq %rax, %rdx
  251. call syscall_trace_enter_phase2
  252. /*
  253. * Reload registers from stack in case ptrace changed them.
  254. * We don't reload %rax because syscall_trace_enter_phase2() returned
  255. * the value it wants us to use in the table lookup.
  256. */
  257. RESTORE_C_REGS_EXCEPT_RAX
  258. RESTORE_EXTRA_REGS
  259. #if __SYSCALL_MASK == ~0
  260. cmpq $NR_syscalls, %rax
  261. #else
  262. andl $__SYSCALL_MASK, %eax
  263. cmpl $NR_syscalls, %eax
  264. #endif
  265. jae 1f /* return -ENOSYS (already in pt_regs->ax) */
  266. sbb %rcx, %rcx /* array_index_mask_nospec() */
  267. and %rcx, %rax
  268. movq %r10, %rcx /* fixup for C */
  269. #ifdef CONFIG_RETPOLINE
  270. movq sys_call_table(, %rax, 8), %rax
  271. call __x86_indirect_thunk_rax
  272. #else
  273. call *sys_call_table(, %rax, 8)
  274. #endif
  275. movq %rax, RAX(%rsp)
  276. 1:
  277. /* Use IRET because user could have changed pt_regs->foo */
  278. /*
  279. * Syscall return path ending with IRET.
  280. * Has correct iret frame.
  281. */
  282. GLOBAL(int_ret_from_sys_call)
  283. SAVE_EXTRA_REGS
  284. movq %rsp, %rdi
  285. call syscall_return_slowpath /* returns with IRQs disabled */
  286. RESTORE_EXTRA_REGS
  287. TRACE_IRQS_IRETQ /* we're about to change IF */
  288. /*
  289. * Try to use SYSRET instead of IRET if we're returning to
  290. * a completely clean 64-bit userspace context.
  291. */
  292. movq RCX(%rsp), %rcx
  293. movq RIP(%rsp), %r11
  294. cmpq %rcx, %r11 /* RCX == RIP */
  295. jne opportunistic_sysret_failed
  296. /*
  297. * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
  298. * in kernel space. This essentially lets the user take over
  299. * the kernel, since userspace controls RSP.
  300. *
  301. * If the width of the "canonical tail" ever becomes variable, this will need
  302. * to be updated to remain correct on both old and new CPUs.
  303. */
  304. .ifne __VIRTUAL_MASK_SHIFT - 47
  305. .error "virtual address width changed -- SYSRET checks need update"
  306. .endif
  307. /* Change top 16 bits to be the sign-extension of 47th bit */
  308. shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
  309. sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
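/*
 * Worked example with __VIRTUAL_MASK_SHIFT == 47: the shl/sar pair above
 * is equivalent to rcx = (s64)(rcx << 16) >> 16. A canonical RIP such as
 * 0x00007fffffffe000 is left unchanged, while a non-canonical one such as
 * 0x0000800000000000 becomes 0xffff800000000000, so the cmpq below sees a
 * mismatch and we fall back to IRET.
 */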
  310. /* If this changed %rcx, it was not canonical */
  311. cmpq %rcx, %r11
  312. jne opportunistic_sysret_failed
  313. cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
  314. jne opportunistic_sysret_failed
  315. movq R11(%rsp), %r11
  316. cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
  317. jne opportunistic_sysret_failed
  318. /*
  319. * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
  320. * restoring TF results in a trap from userspace immediately after
  321. * SYSRET. This would cause an infinite loop whenever #DB happens
  322. * with register state that satisfies the opportunistic SYSRET
  323. * conditions. For example, single-stepping this user code:
  324. *
  325. * movq $stuck_here, %rcx
  326. * pushfq
  327. * popq %r11
  328. * stuck_here:
  329. *
  330. * would never get past 'stuck_here'.
  331. */
  332. testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
  333. jnz opportunistic_sysret_failed
  334. /* nothing to check for RSP */
  335. cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
  336. jne opportunistic_sysret_failed
  337. /*
  338. * We win! This label is here just for ease of understanding
  339. * perf profiles. Nothing jumps here.
  340. */
  341. syscall_return_via_sysret:
  342. /* rcx and r11 are already restored (see code above) */
  343. RESTORE_C_REGS_EXCEPT_RCX_R11
  344. /*
  345. * This opens a window where we have a user CR3, but are
  346. * running in the kernel. This makes using the CS
  347. * register useless for telling whether or not we need to
  348. * switch CR3 in NMIs. Normal interrupts are OK because
  349. * they are off here.
  350. */
  351. SWITCH_USER_CR3
  352. movq RSP(%rsp), %rsp
  353. USERGS_SYSRET64
  354. opportunistic_sysret_failed:
  355. /*
  356. * This opens a window where we have a user CR3, but are
  357. * running in the kernel. This makes using the CS
  358. * register useless for telling whether or not we need to
  359. * switch CR3 in NMIs. Normal interrupts are OK because
  360. * they are off here.
  361. */
  362. SWITCH_USER_CR3
  363. SWAPGS
  364. jmp restore_c_regs_and_iret
  365. END(entry_SYSCALL_64)
  366. .macro FORK_LIKE func
  367. ENTRY(stub_\func)
  368. SAVE_EXTRA_REGS 8
  369. jmp sys_\func
  370. END(stub_\func)
  371. .endm
  372. FORK_LIKE clone
  373. FORK_LIKE fork
  374. FORK_LIKE vfork
  375. ENTRY(stub_execve)
  376. call sys_execve
  377. return_from_execve:
  378. testl %eax, %eax
  379. jz 1f
  380. /* exec failed, can use fast SYSRET code path in this case */
  381. ret
  382. 1:
  383. /* must use IRET code path (pt_regs->cs may have changed) */
  384. addq $8, %rsp
  385. ZERO_EXTRA_REGS
  386. movq %rax, RAX(%rsp)
  387. jmp int_ret_from_sys_call
  388. END(stub_execve)
  389. /*
  390. * Remaining execve stubs are only 7 bytes long.
  391. * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
  392. */
  393. .align 8
  394. GLOBAL(stub_execveat)
  395. call sys_execveat
  396. jmp return_from_execve
  397. END(stub_execveat)
  398. #if defined(CONFIG_X86_X32_ABI)
  399. .align 8
  400. GLOBAL(stub_x32_execve)
  401. call compat_sys_execve
  402. jmp return_from_execve
  403. END(stub_x32_execve)
  404. .align 8
  405. GLOBAL(stub_x32_execveat)
  406. call compat_sys_execveat
  407. jmp return_from_execve
  408. END(stub_x32_execveat)
  409. #endif
  410. /*
  411. * sigreturn is special because it needs to restore all registers on return.
  412. * This cannot be done with SYSRET, so use the IRET return path instead.
  413. */
  414. ENTRY(stub_rt_sigreturn)
  415. /*
  416. * SAVE_EXTRA_REGS result is not normally needed:
  417. * sigreturn overwrites all pt_regs->GPREGS.
  418. * But sigreturn can fail (!), and there is no easy way to detect that.
  419. * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
  420. * we SAVE_EXTRA_REGS here.
  421. */
  422. SAVE_EXTRA_REGS 8
  423. call sys_rt_sigreturn
  424. return_from_stub:
  425. addq $8, %rsp
  426. RESTORE_EXTRA_REGS
  427. movq %rax, RAX(%rsp)
  428. jmp int_ret_from_sys_call
  429. END(stub_rt_sigreturn)
  430. #ifdef CONFIG_X86_X32_ABI
  431. ENTRY(stub_x32_rt_sigreturn)
  432. SAVE_EXTRA_REGS 8
  433. call sys32_x32_rt_sigreturn
  434. jmp return_from_stub
  435. END(stub_x32_rt_sigreturn)
  436. #endif
  437. /*
  438. * A newly forked process directly context switches into this address.
  439. *
  440. * rdi: prev task we switched from
  441. */
  442. ENTRY(ret_from_fork)
  443. LOCK ; btr $TIF_FORK, TI_flags(%r8)
  444. pushq $0x0002
  445. popfq /* reset kernel eflags */
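/*
 * 0x0002 is RFLAGS with only the always-set reserved bit 1 on: interrupts
 * disabled, direction flag clear, and all arithmetic flags zeroed.
 */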
  446. call schedule_tail /* rdi: 'prev' task parameter */
  447. RESTORE_EXTRA_REGS
  448. testb $3, CS(%rsp) /* from kernel_thread? */
  449. /*
  450. * By the time we get here, we have no idea whether our pt_regs,
  451. * ti flags, and ti status came from the 64-bit SYSCALL fast path,
  452. * the slow path, or one of the 32-bit compat paths.
  453. * Use IRET code path to return, since it can safely handle
  454. * all of the above.
  455. */
  456. jnz int_ret_from_sys_call
  457. /*
  458. * We came from kernel_thread
  459. * nb: we depend on RESTORE_EXTRA_REGS above
  460. */
  461. movq %rbp, %rdi
  462. CALL_NOSPEC %rbx
  463. movl $0, RAX(%rsp)
  464. RESTORE_EXTRA_REGS
  465. jmp int_ret_from_sys_call
  466. END(ret_from_fork)
  467. /*
  468. * Build the entry stubs with some assembler magic.
  469. * We pack 1 stub into every 8-byte block.
  470. */
  471. .align 8
  472. ENTRY(irq_entries_start)
  473. vector=FIRST_EXTERNAL_VECTOR
  474. .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
  475. pushq $(~vector+0x80) /* Note: always in signed byte range */
  476. vector=vector+1
  477. jmp common_interrupt
  478. .align 8
  479. .endr
  480. END(irq_entries_start)
  481. /*
  482. * Interrupt entry/exit.
  483. *
  484. * Interrupt entry points save only callee clobbered registers in fast path.
  485. *
  486. * Entry runs with interrupts off.
  487. */
  488. /* 0(%rsp): ~(interrupt number) */
  489. .macro interrupt func
  490. cld
  491. ALLOC_PT_GPREGS_ON_STACK
  492. SAVE_C_REGS
  493. SAVE_EXTRA_REGS
  494. testb $3, CS(%rsp)
  495. jz 1f
  496. /*
  497. * IRQ from user mode. Switch to kernel gsbase and inform context
  498. * tracking that we're in kernel mode.
  499. */
  500. SWAPGS
  501. SWITCH_KERNEL_CR3
  502. /*
  503. * We need to tell lockdep that IRQs are off. We can't do this until
  504. * we fix gsbase, and we should do it before enter_from_user_mode
  505. * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
  506. * the simplest way to handle it is to just call it twice if
  507. * we enter from user mode. There's no reason to optimize this since
  508. * TRACE_IRQS_OFF is a no-op if lockdep is off.
  509. */
  510. TRACE_IRQS_OFF
  511. #ifdef CONFIG_CONTEXT_TRACKING
  512. call enter_from_user_mode
  513. #endif
  514. 1:
  515. /*
  516. * Save previous stack pointer, optionally switch to interrupt stack.
  517. * irq_count is used to check if a CPU is already on an interrupt stack
  518. * or not. While this is essentially redundant with preempt_count it is
  519. * a little cheaper to use a separate counter in the PDA (short of
  520. * moving irq_enter into assembly, which would be too much work)
  521. */
  522. movq %rsp, %rdi
  523. incl PER_CPU_VAR(irq_count)
  524. cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
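/*
 * irq_count is -1 whenever no IRQ stack is in use, so the incl above sets
 * ZF only for the outermost interrupt; cmovzq then switches to the per-cpu
 * IRQ stack only in that case, and nested interrupts keep running on the
 * stack they arrived on.
 */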
  525. pushq %rdi
  526. /* We entered an interrupt context - irqs are off: */
  527. TRACE_IRQS_OFF
  528. call \func /* rdi points to pt_regs */
  529. .endm
  530. /*
  531. * The interrupt stubs push (~vector+0x80) onto the stack and
  532. * then jump to common_interrupt.
  533. */
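/*
 * For example, vector 0x20 is pushed as ~0x20 + 0x80 = 0x5f, which fits in a
 * sign-extended byte and keeps each stub short. The addq $-0x80 below turns
 * it back into ~0x20, and do_IRQ recovers the vector number by complementing
 * pt_regs->orig_ax.
 */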
  534. .p2align CONFIG_X86_L1_CACHE_SHIFT
  535. common_interrupt:
  536. ASM_CLAC
  537. addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
  538. interrupt do_IRQ
  539. /* 0(%rsp): old RSP */
  540. ret_from_intr:
  541. DISABLE_INTERRUPTS(CLBR_NONE)
  542. TRACE_IRQS_OFF
  543. decl PER_CPU_VAR(irq_count)
  544. /* Restore saved previous stack */
  545. popq %rsp
  546. testb $3, CS(%rsp)
  547. jz retint_kernel
  548. /* Interrupt came from user space */
  549. GLOBAL(retint_user)
  550. mov %rsp,%rdi
  551. call prepare_exit_to_usermode
  552. TRACE_IRQS_IRETQ
  553. SWITCH_USER_CR3
  554. SWAPGS
  555. jmp restore_regs_and_iret
  556. /* Returning to kernel space */
  557. retint_kernel:
  558. #ifdef CONFIG_PREEMPT
  559. /* Interrupts are off */
  560. /* Check if we need preemption */
  561. btl $9, EFLAGS(%rsp) /* were interrupts off? */
  562. jnc 1f
  563. 0: cmpl $0, PER_CPU_VAR(__preempt_count)
  564. jnz 1f
  565. call preempt_schedule_irq
  566. jmp 0b
  567. 1:
  568. #endif
  569. /*
  570. * The iretq could re-enable interrupts:
  571. */
  572. TRACE_IRQS_IRETQ
  573. /*
  574. * At this label, code paths which return to kernel and to user,
  575. * which come from interrupts/exception and from syscalls, merge.
  576. */
  577. GLOBAL(restore_regs_and_iret)
  578. RESTORE_EXTRA_REGS
  579. restore_c_regs_and_iret:
  580. RESTORE_C_REGS
  581. REMOVE_PT_GPREGS_FROM_STACK 8
  582. INTERRUPT_RETURN
  583. ENTRY(native_iret)
  584. /*
  585. * Are we returning to a stack segment from the LDT? Note: in
  586. * 64-bit mode SS:RSP on the exception stack is always valid.
  587. */
  588. #ifdef CONFIG_X86_ESPFIX64
  589. testb $4, (SS-RIP)(%rsp)
  590. jnz native_irq_return_ldt
  591. #endif
  592. .global native_irq_return_iret
  593. native_irq_return_iret:
  594. /*
  595. * This may fault. Non-paranoid faults on return to userspace are
  596. * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
  597. * Double-faults due to espfix64 are handled in do_double_fault.
  598. * Other faults here are fatal.
  599. */
  600. iretq
  601. #ifdef CONFIG_X86_ESPFIX64
  602. native_irq_return_ldt:
  603. pushq %rax
  604. pushq %rdi
  605. SWAPGS
  606. SWITCH_KERNEL_CR3
  607. movq PER_CPU_VAR(espfix_waddr), %rdi
  608. movq %rax, (0*8)(%rdi) /* RAX */
  609. movq (2*8)(%rsp), %rax /* RIP */
  610. movq %rax, (1*8)(%rdi)
  611. movq (3*8)(%rsp), %rax /* CS */
  612. movq %rax, (2*8)(%rdi)
  613. movq (4*8)(%rsp), %rax /* RFLAGS */
  614. movq %rax, (3*8)(%rdi)
  615. movq (6*8)(%rsp), %rax /* SS */
  616. movq %rax, (5*8)(%rdi)
  617. movq (5*8)(%rsp), %rax /* RSP */
  618. movq %rax, (4*8)(%rdi)
  619. andl $0xffff0000, %eax
  620. popq %rdi
  621. orq PER_CPU_VAR(espfix_stack), %rax
  622. SWITCH_USER_CR3
  623. SWAPGS
  624. movq %rax, %rsp
  625. popq %rax
  626. jmp native_irq_return_iret
  627. #endif
  628. END(common_interrupt)
  629. /*
  630. * APIC interrupts.
  631. */
  632. .macro apicinterrupt3 num sym do_sym
  633. ENTRY(\sym)
  634. ASM_CLAC
  635. pushq $~(\num)
  636. .Lcommon_\sym:
  637. interrupt \do_sym
  638. jmp ret_from_intr
  639. END(\sym)
  640. .endm
  641. #ifdef CONFIG_TRACING
  642. #define trace(sym) trace_##sym
  643. #define smp_trace(sym) smp_trace_##sym
  644. .macro trace_apicinterrupt num sym
  645. apicinterrupt3 \num trace(\sym) smp_trace(\sym)
  646. .endm
  647. #else
  648. .macro trace_apicinterrupt num sym do_sym
  649. .endm
  650. #endif
  651. .macro apicinterrupt num sym do_sym
  652. apicinterrupt3 \num \sym \do_sym
  653. trace_apicinterrupt \num \sym
  654. .endm
  655. #ifdef CONFIG_SMP
  656. apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
  657. apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
  658. #endif
  659. #ifdef CONFIG_X86_UV
  660. apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
  661. #endif
  662. apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
  663. apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
  664. #ifdef CONFIG_HAVE_KVM
  665. apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
  666. apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
  667. #endif
  668. #ifdef CONFIG_X86_MCE_THRESHOLD
  669. apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
  670. #endif
  671. #ifdef CONFIG_X86_MCE_AMD
  672. apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
  673. #endif
  674. #ifdef CONFIG_X86_THERMAL_VECTOR
  675. apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
  676. #endif
  677. #ifdef CONFIG_SMP
  678. apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
  679. apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
  680. apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
  681. #endif
  682. apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
  683. apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
  684. #ifdef CONFIG_IRQ_WORK
  685. apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
  686. #endif
  687. /*
  688. * Exception entry points.
  689. */
  690. #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
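/*
 * For example, CPU_TSS_IST(DEBUG_STACK) is the IST slot used by the #DB
 * handler; the shift_ist variant of idtentry below moves it down and back
 * up by EXCEPTION_STKSZ around the call to \do_sym so that a nested debug
 * exception lands on a fresh region of the debug stack.
 */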
  691. .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
  692. ENTRY(\sym)
  693. /* Sanity check */
  694. .if \shift_ist != -1 && \paranoid == 0
  695. .error "using shift_ist requires paranoid=1"
  696. .endif
  697. ASM_CLAC
  698. PARAVIRT_ADJUST_EXCEPTION_FRAME
  699. .ifeq \has_error_code
  700. pushq $-1 /* ORIG_RAX: no syscall to restart */
  701. .endif
  702. ALLOC_PT_GPREGS_ON_STACK
  703. .if \paranoid
  704. .if \paranoid == 1
  705. testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
  706. jnz 1f
  707. .endif
  708. call paranoid_entry
  709. .else
  710. call error_entry
  711. .endif
  712. /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
  713. .if \paranoid
  714. .if \shift_ist != -1
  715. TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
  716. .else
  717. TRACE_IRQS_OFF
  718. .endif
  719. .endif
  720. movq %rsp, %rdi /* pt_regs pointer */
  721. .if \has_error_code
  722. movq ORIG_RAX(%rsp), %rsi /* get error code */
  723. movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
  724. .else
  725. xorl %esi, %esi /* no error code */
  726. .endif
  727. .if \shift_ist != -1
  728. subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
  729. .endif
  730. call \do_sym
  731. .if \shift_ist != -1
  732. addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
  733. .endif
  734. /* these procedures expect "no swapgs" flag in ebx */
  735. .if \paranoid
  736. jmp paranoid_exit
  737. .else
  738. jmp error_exit
  739. .endif
  740. .if \paranoid == 1
  741. /*
  742. * Paranoid entry from userspace. Switch stacks and treat it
  743. * as a normal entry. This means that paranoid handlers
  744. * run in real process context if user_mode(regs).
  745. */
  746. 1:
  747. call error_entry
  748. movq %rsp, %rdi /* pt_regs pointer */
  749. call sync_regs
  750. movq %rax, %rsp /* switch stack */
  751. movq %rsp, %rdi /* pt_regs pointer */
  752. .if \has_error_code
  753. movq ORIG_RAX(%rsp), %rsi /* get error code */
  754. movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
  755. .else
  756. xorl %esi, %esi /* no error code */
  757. .endif
  758. call \do_sym
  759. jmp error_exit
  760. .endif
  761. END(\sym)
  762. .endm
  763. #ifdef CONFIG_TRACING
  764. .macro trace_idtentry sym do_sym has_error_code:req
  765. idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
  766. idtentry \sym \do_sym has_error_code=\has_error_code
  767. .endm
  768. #else
  769. .macro trace_idtentry sym do_sym has_error_code:req
  770. idtentry \sym \do_sym has_error_code=\has_error_code
  771. .endm
  772. #endif
  773. idtentry divide_error do_divide_error has_error_code=0
  774. idtentry overflow do_overflow has_error_code=0
  775. idtentry bounds do_bounds has_error_code=0
  776. idtentry invalid_op do_invalid_op has_error_code=0
  777. idtentry device_not_available do_device_not_available has_error_code=0
  778. idtentry double_fault do_double_fault has_error_code=1 paranoid=2
  779. idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
  780. idtentry invalid_TSS do_invalid_TSS has_error_code=1
  781. idtentry segment_not_present do_segment_not_present has_error_code=1
  782. idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
  783. idtentry coprocessor_error do_coprocessor_error has_error_code=0
  784. idtentry alignment_check do_alignment_check has_error_code=1
  785. idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
  786. /*
  787. * Reload gs selector with exception handling
  788. * edi: new selector
  789. */
  790. ENTRY(native_load_gs_index)
  791. pushfq
  792. DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
  793. SWAPGS
  794. gs_change:
  795. movl %edi, %gs
  796. 2: mfence /* workaround */
  797. SWAPGS
  798. popfq
  799. ret
  800. END(native_load_gs_index)
  801. _ASM_EXTABLE(gs_change, bad_gs)
  802. .section .fixup, "ax"
  803. /* running with kernelgs */
  804. bad_gs:
  805. SWAPGS /* switch back to user gs */
  806. xorl %eax, %eax
  807. movl %eax, %gs
  808. jmp 2b
  809. .previous
  810. /* Call softirq on interrupt stack. Interrupts are off. */
  811. ENTRY(do_softirq_own_stack)
  812. pushq %rbp
  813. mov %rsp, %rbp
  814. incl PER_CPU_VAR(irq_count)
  815. cmove PER_CPU_VAR(irq_stack_ptr), %rsp
  816. push %rbp /* frame pointer backlink */
  817. call __do_softirq
  818. leaveq
  819. decl PER_CPU_VAR(irq_count)
  820. ret
  821. END(do_softirq_own_stack)
  822. #ifdef CONFIG_XEN
  823. idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  824. /*
  825. * A note on the "critical region" in our callback handler.
  826. * We want to avoid stacking callback handlers due to events occurring
  827. * during handling of the last event. To do this, we keep events disabled
  828. * until we've done all processing. HOWEVER, we must enable events before
  829. * popping the stack frame (can't be done atomically) and so it would still
  830. * be possible to get enough handler activations to overflow the stack.
  831. * Although unlikely, bugs of that kind are hard to track down, so we'd
  832. * like to avoid the possibility.
  833. * So, on entry to the handler we detect whether we interrupted an
  834. * existing activation in its critical region -- if so, we pop the current
  835. * activation and restart the handler using the previous one.
  836. */
  837. ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct pt_regs *) */
  838. /*
  839. * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
  840. * see the correct pointer to the pt_regs
  841. */
  842. movq %rdi, %rsp /* we don't return, adjust the stack frame */
  843. 11: incl PER_CPU_VAR(irq_count)
  844. movq %rsp, %rbp
  845. cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
  846. pushq %rbp /* frame pointer backlink */
  847. call xen_evtchn_do_upcall
  848. popq %rsp
  849. decl PER_CPU_VAR(irq_count)
  850. #ifndef CONFIG_PREEMPT
  851. call xen_maybe_preempt_hcall
  852. #endif
  853. jmp error_exit
  854. END(xen_do_hypervisor_callback)
  855. /*
  856. * Hypervisor uses this for application faults while it executes.
  857. * We get here for two reasons:
  858. * 1. Fault while reloading DS, ES, FS or GS
  859. * 2. Fault while executing IRET
  860. * Category 1 we do not need to fix up as Xen has already reloaded all segment
  861. * registers that could be reloaded and zeroed the others.
  862. * Category 2 we fix up by killing the current process. We cannot use the
  863. * normal Linux return path in this case because if we use the IRET hypercall
  864. * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
  865. * We distinguish between categories by comparing each saved segment register
  866. * with its current contents: any discrepancy means we are in category 1.
  867. */
  868. ENTRY(xen_failsafe_callback)
  869. movl %ds, %ecx
  870. cmpw %cx, 0x10(%rsp)
  871. jne 1f
  872. movl %es, %ecx
  873. cmpw %cx, 0x18(%rsp)
  874. jne 1f
  875. movl %fs, %ecx
  876. cmpw %cx, 0x20(%rsp)
  877. jne 1f
  878. movl %gs, %ecx
  879. cmpw %cx, 0x28(%rsp)
  880. jne 1f
  881. /* All segments match their saved values => Category 2 (Bad IRET). */
  882. movq (%rsp), %rcx
  883. movq 8(%rsp), %r11
  884. addq $0x30, %rsp
  885. pushq $0 /* RIP */
  886. pushq %r11
  887. pushq %rcx
  888. jmp general_protection
  889. 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
  890. movq (%rsp), %rcx
  891. movq 8(%rsp), %r11
  892. addq $0x30, %rsp
  893. pushq $-1 /* orig_ax = -1 => not a system call */
  894. ALLOC_PT_GPREGS_ON_STACK
  895. SAVE_C_REGS
  896. SAVE_EXTRA_REGS
  897. jmp error_exit
  898. END(xen_failsafe_callback)
  899. apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
  900. xen_hvm_callback_vector xen_evtchn_do_upcall
  901. #endif /* CONFIG_XEN */
  902. #if IS_ENABLED(CONFIG_HYPERV)
  903. apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
  904. hyperv_callback_vector hyperv_vector_handler
  905. #endif /* CONFIG_HYPERV */
  906. idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
  907. idtentry int3 do_int3 has_error_code=0
  908. idtentry stack_segment do_stack_segment has_error_code=1
  909. #ifdef CONFIG_XEN
  910. idtentry xen_debug do_debug has_error_code=0
  911. idtentry xen_int3 do_int3 has_error_code=0
  912. idtentry xen_stack_segment do_stack_segment has_error_code=1
  913. #endif
  914. idtentry general_protection do_general_protection has_error_code=1
  915. trace_idtentry page_fault do_page_fault has_error_code=1
  916. #ifdef CONFIG_KVM_GUEST
  917. idtentry async_page_fault do_async_page_fault has_error_code=1
  918. #endif
  919. #ifdef CONFIG_X86_MCE
  920. idtentry machine_check do_mce has_error_code=0 paranoid=1
  921. #endif
  922. /*
  923. * Save all registers in pt_regs, and switch gs if needed.
  925. * Use the slow but surefire "are we in kernel?" check.
  925. *
  926. * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
  927. * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
  928. * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
  929. * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
  930. */
  931. ENTRY(paranoid_entry)
  932. cld
  933. SAVE_C_REGS 8
  934. SAVE_EXTRA_REGS 8
  935. movl $1, %ebx
  936. movl $MSR_GS_BASE, %ecx
  937. rdmsr
  938. testl %edx, %edx
  939. js 1f /* negative -> in kernel */
  940. SWAPGS
  941. xorl %ebx, %ebx
  942. 1:
  943. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  944. /*
  945. * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
  946. * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
  947. * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
  948. * unconditionally, but we need to find out whether the reverse
  949. * should be done on return (conveyed to paranoid_exit in %ebx).
  950. */
  951. ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
  952. testl $KAISER_SHADOW_PGD_OFFSET, %eax
  953. jz 2f
  954. orl $2, %ebx
  955. andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
  956. /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
  957. ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
  958. movq %rax, %cr3
  959. 2:
  960. #endif
  961. ret
  962. END(paranoid_entry)
  963. /*
  964. * "Paranoid" exit path from exception stack. This is invoked
  965. * only on return from non-NMI IST interrupts that came
  966. * from kernel space.
  967. *
  968. * We may be returning to very strange contexts (e.g. very early
  969. * in syscall entry), so checking for preemption here would
  970. * be complicated. Fortunately, there's no good reason
  971. * to try to handle preemption here.
  972. *
  973. * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
  974. * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
  975. * ebx=2: needs both swapgs and SWITCH_USER_CR3
  976. * ebx=3: needs SWITCH_USER_CR3 but not swapgs
  977. */
  978. ENTRY(paranoid_exit)
  979. DISABLE_INTERRUPTS(CLBR_NONE)
  980. TRACE_IRQS_OFF_DEBUG
  981. TRACE_IRQS_IRETQ_DEBUG
  982. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  983. /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
  984. testl $2, %ebx /* SWITCH_USER_CR3 needed? */
  985. jz paranoid_exit_no_switch
  986. SWITCH_USER_CR3
  987. paranoid_exit_no_switch:
  988. #endif
  989. testl $1, %ebx /* swapgs needed? */
  990. jnz paranoid_exit_no_swapgs
  991. SWAPGS_UNSAFE_STACK
  992. paranoid_exit_no_swapgs:
  993. RESTORE_EXTRA_REGS
  994. RESTORE_C_REGS
  995. REMOVE_PT_GPREGS_FROM_STACK 8
  996. INTERRUPT_RETURN
  997. END(paranoid_exit)
  998. /*
  999. * Save all registers in pt_regs, and switch gs if needed.
  1000. */
  1001. ENTRY(error_entry)
  1002. cld
  1003. SAVE_C_REGS 8
  1004. SAVE_EXTRA_REGS 8
  1005. /*
  1006. * error_entry() always returns with a kernel gsbase and
  1007. * CR3. We must also have a kernel CR3/gsbase before
  1008. * calling TRACE_IRQS_*. Just unconditionally switch to
  1009. * the kernel CR3 here.
  1010. */
  1011. SWITCH_KERNEL_CR3
  1012. testb $3, CS+8(%rsp)
  1013. jz .Lerror_kernelspace
  1014. .Lerror_entry_from_usermode_swapgs:
  1015. /*
  1016. * We entered from user mode or we're pretending to have entered
  1017. * from user mode due to an IRET fault.
  1018. */
  1019. SWAPGS
  1020. .Lerror_entry_from_usermode_after_swapgs:
  1021. /*
  1022. * We need to tell lockdep that IRQs are off. We can't do this until
  1023. * we fix gsbase, and we should do it before enter_from_user_mode
  1024. * (which can take locks).
  1025. */
  1026. TRACE_IRQS_OFF
  1027. #ifdef CONFIG_CONTEXT_TRACKING
  1028. call enter_from_user_mode
  1029. #endif
  1030. ret
  1031. .Lerror_entry_done:
  1032. TRACE_IRQS_OFF
  1033. ret
  1034. /*
  1035. * There are two places in the kernel that can potentially fault with
  1036. * usergs. Handle them here. B stepping K8s sometimes report a
  1037. * truncated RIP for IRET exceptions returning to compat mode. Check
  1038. * for these here too.
  1039. */
  1040. .Lerror_kernelspace:
  1041. leaq native_irq_return_iret(%rip), %rcx
  1042. cmpq %rcx, RIP+8(%rsp)
  1043. je .Lerror_bad_iret
  1044. movl %ecx, %eax /* zero extend */
  1045. cmpq %rax, RIP+8(%rsp)
  1046. je .Lbstep_iret
  1047. cmpq $gs_change, RIP+8(%rsp)
  1048. jne .Lerror_entry_done
  1049. /*
  1050. * hack: gs_change can fail with user gsbase. If this happens, fix up
  1051. * gsbase and proceed. We'll fix up the exception and land in
  1052. * gs_change's error handler with kernel gsbase.
  1053. */
  1054. jmp .Lerror_entry_from_usermode_swapgs
  1055. .Lbstep_iret:
  1056. /* Fix truncated RIP */
  1057. movq %rcx, RIP+8(%rsp)
  1058. /* fall through */
  1059. .Lerror_bad_iret:
  1060. /*
  1061. * We came from an IRET to user mode, so we have user gsbase.
  1062. * Switch to kernel gsbase:
  1063. */
  1064. SWAPGS
  1065. /*
  1066. * Pretend that the exception came from user mode: set up pt_regs
  1067. * as if we faulted immediately after IRET.
  1068. */
  1069. mov %rsp, %rdi
  1070. call fixup_bad_iret
  1071. mov %rax, %rsp
  1072. jmp .Lerror_entry_from_usermode_after_swapgs
  1073. END(error_entry)
  1074. ENTRY(error_exit)
  1075. DISABLE_INTERRUPTS(CLBR_NONE)
  1076. TRACE_IRQS_OFF
  1077. testb $3, CS(%rsp)
  1078. jz retint_kernel
  1079. jmp retint_user
  1080. END(error_exit)
  1081. /* Runs on exception stack */
  1082. ENTRY(nmi)
  1083. /*
  1084. * Fix up the exception frame if we're on Xen.
  1085. * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
  1086. * one value to the stack on native, so it may clobber the rdx
  1087. * scratch slot, but it won't clobber any of the important
  1088. * slots past it.
  1089. *
  1090. * Xen is a different story, because the Xen frame itself overlaps
  1091. * the "NMI executing" variable.
  1092. */
  1093. PARAVIRT_ADJUST_EXCEPTION_FRAME
  1094. /*
  1095. * We allow breakpoints in NMIs. If a breakpoint occurs, then
  1096. * the iretq it performs will take us out of NMI context.
  1097. * This means that we can have nested NMIs where the next
  1098. * NMI is using the top of the stack of the previous NMI. We
  1099. * can't let it execute because the nested NMI will corrupt the
  1100. * stack of the previous NMI. NMI handlers are not re-entrant
  1101. * anyway.
  1102. *
  1103. * To handle this case we do the following:
  1104. * Check a special location on the stack that contains
  1105. * a variable that is set when NMIs are executing.
  1106. * The interrupted task's stack is also checked to see if it
  1107. * is an NMI stack.
  1108. * If the variable is not set and the stack is not the NMI
  1109. * stack then:
  1110. * o Set the special variable on the stack
  1111. * o Copy the interrupt frame into an "outermost" location on the
  1112. * stack
  1113. * o Copy the interrupt frame into an "iret" location on the stack
  1114. * o Continue processing the NMI
  1115. * If the variable is set or the previous stack is the NMI stack:
  1116. * o Modify the "iret" location to jump to the repeat_nmi
  1117. * o return back to the first NMI
  1118. *
  1119. * Now, on exit of the first NMI, we first clear the stack variable.
  1120. * The NMI stack will tell any nested NMIs at that point that it is
  1121. * nested. Then we pop the stack normally with iret, and if there was
  1122. * a nested NMI that updated the copied interrupt stack frame, a
  1123. * jump will be made to the repeat_nmi code that will handle the second
  1124. * NMI.
  1125. *
  1126. * However, espfix prevents us from directly returning to userspace
  1127. * with a single IRET instruction. Similarly, IRET to user mode
  1128. * can fault. We therefore handle NMIs from user space like
  1129. * other IST entries.
  1130. */
  1131. ASM_CLAC
  1132. /* Use %rdx as our temp variable throughout */
  1133. pushq %rdx
  1134. testb $3, CS-RIP+8(%rsp)
  1135. jz .Lnmi_from_kernel
  1136. /*
  1137. * NMI from user mode. We need to run on the thread stack, but we
  1138. * can't go through the normal entry paths: NMIs are masked, and
  1139. * we don't want to enable interrupts, because then we'll end
  1140. * up in an awkward situation in which IRQs are on but NMIs
  1141. * are off.
  1142. *
  1143. * We also must not push anything to the stack before switching
  1144. * stacks lest we corrupt the "NMI executing" variable.
  1145. */
  1146. SWAPGS_UNSAFE_STACK
  1147. /*
  1148. * percpu variables are mapped with user CR3, so no need
  1149. * to switch CR3 here.
  1150. */
  1151. cld
  1152. movq %rsp, %rdx
  1153. movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  1154. pushq 5*8(%rdx) /* pt_regs->ss */
  1155. pushq 4*8(%rdx) /* pt_regs->rsp */
  1156. pushq 3*8(%rdx) /* pt_regs->flags */
  1157. pushq 2*8(%rdx) /* pt_regs->cs */
  1158. pushq 1*8(%rdx) /* pt_regs->rip */
  1159. pushq $-1 /* pt_regs->orig_ax */
  1160. pushq %rdi /* pt_regs->di */
  1161. pushq %rsi /* pt_regs->si */
  1162. pushq (%rdx) /* pt_regs->dx */
  1163. pushq %rcx /* pt_regs->cx */
  1164. pushq %rax /* pt_regs->ax */
  1165. pushq %r8 /* pt_regs->r8 */
  1166. pushq %r9 /* pt_regs->r9 */
  1167. pushq %r10 /* pt_regs->r10 */
  1168. pushq %r11 /* pt_regs->r11 */
  1169. pushq %rbx /* pt_regs->rbx */
  1170. pushq %rbp /* pt_regs->rbp */
  1171. pushq %r12 /* pt_regs->r12 */
  1172. pushq %r13 /* pt_regs->r13 */
  1173. pushq %r14 /* pt_regs->r14 */
  1174. pushq %r15 /* pt_regs->r15 */
  1175. /*
  1176. * At this point we no longer need to worry about stack damage
  1177. * due to nesting -- we're on the normal thread stack and we're
  1178. * done with the NMI stack.
  1179. */
  1180. movq %rsp, %rdi
  1181. movq $-1, %rsi
  1182. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  1183. /* Unconditionally use kernel CR3 for do_nmi() */
  1184. /* %rax is saved above, so OK to clobber here */
  1185. ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
  1186. /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
  1187. ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
  1188. pushq %rax
  1189. /* mask off "user" bit of pgd address and 12 PCID bits: */
  1190. andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
  1191. movq %rax, %cr3
  1192. 2:
  1193. #endif
  1194. call do_nmi
  1195. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  1196. /*
  1197. * Unconditionally restore CR3. I know we return to
  1198. * kernel code that needs user CR3, but do we ever return
  1199. * to "user mode" where we need the kernel CR3?
  1200. */
  1201. ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
  1202. #endif
  1203. /*
  1204. * Return back to user mode. We must *not* do the normal exit
  1205. * work, because we don't want to enable interrupts. Do not
  1206. * switch to user CR3: we might be going back to kernel code
  1207. * that had a user CR3 set.
  1208. */
  1209. SWAPGS
  1210. jmp restore_c_regs_and_iret
  1211. .Lnmi_from_kernel:
  1212. /*
  1213. * Here's what our stack frame will look like:
  1214. * +---------------------------------------------------------+
  1215. * | original SS |
  1216. * | original Return RSP |
  1217. * | original RFLAGS |
  1218. * | original CS |
  1219. * | original RIP |
  1220. * +---------------------------------------------------------+
  1221. * | temp storage for rdx |
  1222. * +---------------------------------------------------------+
  1223. * | "NMI executing" variable |
  1224. * +---------------------------------------------------------+
  1225. * | iret SS } Copied from "outermost" frame |
  1226. * | iret Return RSP } on each loop iteration; overwritten |
  1227. * | iret RFLAGS } by a nested NMI to force another |
  1228. * | iret CS } iteration if needed. |
  1229. * | iret RIP } |
  1230. * +---------------------------------------------------------+
  1231. * | outermost SS } initialized in first_nmi; |
  1232. * | outermost Return RSP } will not be changed before |
  1233. * | outermost RFLAGS } NMI processing is done. |
  1234. * | outermost CS } Copied to "iret" frame on each |
  1235. * | outermost RIP } iteration. |
  1236. * +---------------------------------------------------------+
  1237. * | pt_regs |
  1238. * +---------------------------------------------------------+
  1239. *
  1240. * The "original" frame is used by hardware. Before re-enabling
  1241. * NMIs, we need to be done with it, and we need to leave enough
  1242. * space for the asm code here.
  1243. *
  1244. * We return by executing IRET while RSP points to the "iret" frame.
  1245. * That will either return for real or it will loop back into NMI
  1246. * processing.
  1247. *
  1248. * The "outermost" frame is copied to the "iret" frame on each
  1249. * iteration of the loop, so each iteration starts with the "iret"
  1250. * frame pointing to the final return target.
  1251. */
  1252. /*
  1253. * Determine whether we're a nested NMI.
  1254. *
  1255. * If we interrupted kernel code between repeat_nmi and
  1256. * end_repeat_nmi, then we are a nested NMI. We must not
  1257. * modify the "iret" frame because it's being written by
  1258. * the outer NMI. That's okay; the outer NMI handler is
  1259. * about to call do_nmi anyway, so we can just
  1260. * resume the outer NMI.
  1261. */
  1262. movq $repeat_nmi, %rdx
  1263. cmpq 8(%rsp), %rdx
  1264. ja 1f
  1265. movq $end_repeat_nmi, %rdx
  1266. cmpq 8(%rsp), %rdx
  1267. ja nested_nmi_out
  1268. 1:
  1269. /*
  1270. * Now check "NMI executing". If it's set, then we're nested.
  1271. * This will not detect if we interrupted an outer NMI just
  1272. * before IRET.
  1273. */
  1274. cmpl $1, -8(%rsp)
  1275. je nested_nmi
  1276. /*
  1277. * Now test if the previous stack was an NMI stack. This covers
  1278. * the case where we interrupt an outer NMI after it clears
  1279. * "NMI executing" but before IRET. We need to be careful, though:
  1280. * there is one case in which RSP could point to the NMI stack
  1281. * despite there being no NMI active: naughty userspace controls
  1282. * RSP at the very beginning of the SYSCALL targets. We can
  1283. * pull a fast one on naughty userspace, though: we program
  1284. * SYSCALL to mask DF, so userspace cannot cause DF to be set
  1285. * if it controls the kernel's RSP. We set DF before we clear
  1286. * "NMI executing".
  1287. */
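/*
 * 6*8(%rsp) skips the saved %rdx plus the five-word hardware iret frame,
 * i.e. it is the value RSP had when this NMI was delivered, which is the
 * top of the NMI IST stack.
 */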
  1288. lea 6*8(%rsp), %rdx
  1289. /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
  1290. cmpq %rdx, 4*8(%rsp)
  1291. /* If the stack pointer is above the NMI stack, this is a normal NMI */
  1292. ja first_nmi
  1293. subq $EXCEPTION_STKSZ, %rdx
  1294. cmpq %rdx, 4*8(%rsp)
  1295. /* If it is below the NMI stack, it is a normal NMI */
  1296. jb first_nmi
  1297. /* Ah, it is within the NMI stack. */
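/*
 * X86_EFLAGS_DF is bit 10, so testing X86_EFLAGS_DF >> 8 against the byte
 * at (3*8 + 1)(%rsp) checks DF in the RFLAGS saved by the NMI. SYSCALL
 * masks DF, so user-controlled RSP cannot forge this.
 */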
  1298. testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
  1299. jz first_nmi /* RSP was user controlled. */
  1300. /* This is a nested NMI. */
  1301. nested_nmi:
  1302. /*
  1303. * Modify the "iret" frame to point to repeat_nmi, forcing another
  1304. * iteration of NMI handling.
  1305. */
  1306. subq $8, %rsp
  1307. leaq -10*8(%rsp), %rdx
  1308. pushq $__KERNEL_DS
  1309. pushq %rdx
  1310. pushfq
  1311. pushq $__KERNEL_CS
  1312. pushq $repeat_nmi
  1313. /* Put stack back */
  1314. addq $(6*8), %rsp
  1315. nested_nmi_out:
  1316. popq %rdx
  1317. /* We are returning to kernel mode, so this cannot result in a fault. */
  1318. INTERRUPT_RETURN
  1319. first_nmi:
  1320. /* Restore rdx. */
  1321. movq (%rsp), %rdx
  1322. /* Make room for "NMI executing". */
  1323. pushq $0
  1324. /* Leave room for the "iret" frame */
  1325. subq $(5*8), %rsp
  1326. /* Copy the "original" frame to the "outermost" frame */
  1327. .rept 5
  1328. pushq 11*8(%rsp)
  1329. .endr
  1330. /* Everything up to here is safe from nested NMIs */
  1331. #ifdef CONFIG_DEBUG_ENTRY
  1332. /*
  1333. * For ease of testing, unmask NMIs right away. Disabled by
  1334. * default because IRET is very expensive.
  1335. */
  1336. pushq $0 /* SS */
  1337. pushq %rsp /* RSP (minus 8 because of the previous push) */
  1338. addq $8, (%rsp) /* Fix up RSP */
  1339. pushfq /* RFLAGS */
  1340. pushq $__KERNEL_CS /* CS */
  1341. pushq $1f /* RIP */
  1342. INTERRUPT_RETURN /* continues at repeat_nmi below */
  1343. 1:
  1344. #endif
  1345. repeat_nmi:
  1346. /*
  1347. * If there was a nested NMI, the first NMI's iret will return
  1348. * here. But NMIs are still enabled and we can take another
  1349. * nested NMI. The nested NMI checks the interrupted RIP to see
  1350. * if it is between repeat_nmi and end_repeat_nmi, and if so
  1351. * it will just return, as we are about to repeat an NMI anyway.
  1352. * This makes it safe to copy to the stack frame that a nested
  1353. * NMI will update.
  1354. *
  1355. * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
  1356. * we're repeating an NMI, gsbase has the same value that it had on
  1357. * the first iteration. paranoid_entry will load the kernel
  1358. * gsbase if needed before we call do_nmi. "NMI executing"
  1359. * is zero.
  1360. */
  1361. movq $1, 10*8(%rsp) /* Set "NMI executing". */
  1362. /*
  1363. * Copy the "outermost" frame to the "iret" frame. NMIs that nest
  1364. * here must not modify the "iret" frame while we're writing to
  1365. * it or it will end up containing garbage.
  1366. */
  1367. addq $(10*8), %rsp
  1368. .rept 5
  1369. pushq -6*8(%rsp)
  1370. .endr
  1371. subq $(5*8), %rsp
  1372. end_repeat_nmi:
  1373. /*
  1374. * Everything below this point can be preempted by a nested NMI.
  1375. * If this happens, then the inner NMI will change the "iret"
  1376. * frame to point back to repeat_nmi.
  1377. */
  1378. pushq $-1 /* ORIG_RAX: no syscall to restart */
  1379. ALLOC_PT_GPREGS_ON_STACK
  1380. /*
  1381. * Use the same approach as paranoid_entry to handle SWAPGS, but
  1382. * without CR3 handling since we do that differently in NMIs. No
  1383. * need to use paranoid_exit as we should not be calling schedule
  1384. * in NMI context, even with normal interrupts enabled. An NMI
  1385. * should not be setting NEED_RESCHED or anything that normal
  1386. * interrupts and exceptions might do.
  1387. */
  1388. cld
  1389. SAVE_C_REGS
  1390. SAVE_EXTRA_REGS
  1391. movl $1, %ebx
  1392. movl $MSR_GS_BASE, %ecx
  1393. rdmsr
  1394. testl %edx, %edx
  1395. js 1f /* negative -> in kernel */
  1396. SWAPGS
  1397. xorl %ebx, %ebx
  1398. 1:
  1399. movq %rsp, %rdi
  1400. movq $-1, %rsi
  1401. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  1402. /* Unconditionally use kernel CR3 for do_nmi() */
  1403. /* %rax is saved above, so OK to clobber here */
  1404. ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
  1405. /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
  1406. ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
  1407. pushq %rax
  1408. /* mask off "user" bit of pgd address and 12 PCID bits: */
  1409. andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
  1410. movq %rax, %cr3
  1411. 2:
  1412. #endif
  1413. /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
  1414. call do_nmi
  1415. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  1416. /*
  1417. * Unconditionally restore CR3. We might be returning to
  1418. * kernel code that needs user CR3, like just before
  1419. * a sysret.
  1420. */
  1421. ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
  1422. #endif
  1423. testl %ebx, %ebx /* swapgs needed? */
  1424. jnz nmi_restore
  1425. nmi_swapgs:
  1426. /* We fixed up CR3 above, so no need to switch it here */
  1427. SWAPGS_UNSAFE_STACK
  1428. nmi_restore:
  1429. RESTORE_EXTRA_REGS
  1430. RESTORE_C_REGS
  1431. /* Point RSP at the "iret" frame. */
  1432. REMOVE_PT_GPREGS_FROM_STACK 6*8
  1433. /*
  1434. * Clear "NMI executing". Set DF first so that we can easily
  1435. * distinguish the remaining code between here and IRET from
  1436. * the SYSCALL entry and exit paths. On a native kernel, we
  1437. * could just inspect RIP, but, on paravirt kernels,
  1438. * INTERRUPT_RETURN can translate into a jump into a
  1439. * hypercall page.
  1440. */
  1441. std
  1442. movq $0, 5*8(%rsp) /* clear "NMI executing" */
  1443. /*
  1444. * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
  1445. * stack in a single instruction. We are returning to kernel
  1446. * mode, so this cannot result in a fault.
  1447. */
  1448. INTERRUPT_RETURN
  1449. END(nmi)
  1450. ENTRY(ignore_sysret)
  1451. mov $-ENOSYS, %eax
  1452. sysret
  1453. END(ignore_sysret)