The Sentry runs in both GR0 and HR3. The bluepill code is responsible for transparently bouncing the Sentry between these two modes, with the following transition events:
In HR3:
CLI (disable interrupts) => switch to GR0
In GR0:
Fault => switch to HR3
System call => switch to HR3
syscall() ---> sysenter() ---> kernelSyscall() (note: this is in the ring0 kernel) ---> "HLT" instruction ---> kvm_emulate_halt() in KVM (trap to KVM) ---> bluepillHandler() handles _KVM_EXIT_HLT ---> c.notify() ---> syscall.RawSyscall6(syscall.SYS_FUTEX, ...). I also checked the asm code for syscall.RawSyscall6() at https://golang.org/src/syscall/asm_darwin_amd64.s?h=RawSyscall

//go:nosplit
func (c *vCPU) notify() {
	_, _, errno := syscall.RawSyscall6(syscall.SYS_FUTEX,
		uintptr(unsafe.Pointer(&c.state)),
		linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
		math.MaxInt32, // Number of waiters.
		0, 0, 0)
	if errno != 0 {
		throw("futex wake error")
	}
}
The HLT trigger a VMEXIT, which manifests as a return to the KVM_RUN ioctl on the host side. This is in the host signal handler. The state is then copied into the signal frame, and signal_return is called.
entersyscall and exitsyscall below are the Go runtime's entersyscall and exitsyscall.
entersyscall()
bluepill(c)
vector = c.CPU.SwitchToUser(switchOpts)
exitsyscall()
switch vector {
case ring0.Syscall, ring0.SyscallInt80:
	// Fast path: system call executed.
	return usermem.NoAccess, nil
case ring0.PageFault:
	return c.fault(int32(syscall.SIGSEGV), info)
case ring0.Debug, ring0.Breakpoint:
	*info = arch.SignalInfo{
		Signo: int32(syscall.SIGTRAP),
		Code:  1, // TRAP_BRKPT (breakpoint).
	}
	info.SetAddr(switchOpts.Registers.Rip) // Include address.
	return usermem.AccessType{}, platform.ErrContextSignal
entry_impl_amd64.s:44:#define SyscallInt80 0x80 entry_impl_amd64.s:45:#define Syscall 0x100
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)) // Sanitize registers. regs := switchOpts.Registers regs.Eflags &= ^uint64(UserFlagsClear) regs.Eflags |= UserFlagsSet regs.Cs = uint64(Ucode64) // Required for iret. regs.Ss = uint64(Udata) // Ditto. // Perform the switch. swapgs() // GS will be swapped on return. WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. if switchOpts.FullRestore { vector = iret(c, regs, uintptr(userCR3)) //sysenter
} else
{
vector = sysret(c, regs, uintptr(userCR3))
}
SaveFloatingPoint(switchOpts.FloatingPointState)
// escapes: no. Copy out floating point. WriteFS(uintptr(c.registers.Fs_base))
// escapes: no. Restore kernel FS.
return
}
Code executed by the guest:
// Set the entrypoint for the kernel. kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) //代码首地址 kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
//go:nosplit func start(c *CPU) { // Save per-cpu & FS segment. WriteGS(kernelAddr(c.kernelEntry)) WriteFS(uintptr(c.registers.Fs_base)) // Initialize floating point. // // Note that on skylake, the valid XCR0 mask reported seems to be 0xff. // This breaks down as: // // bit0 - x87 // bit1 - SSE // bit2 - AVX // bit3-4 - MPX // bit5-7 - AVX512 // // For some reason, enabled MPX & AVX512 on platforms that report them // seems to be cause a general protection fault. (Maybe there are some // virtualization issues and these aren't exported to the guest cpuid.) // This needs further investigation, but we can limit the floating // point operations to x87, SSE & AVX for now. fninit() xsetbv(0, validXCR0Mask&0x7) // Set the syscall target. wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) // NOTE: This depends on having the 64-bit segments immediately // following the 32-bit user segments. This is simply the way the // sysret instruction is designed to work (it assumes they follow). wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) }
sysenter
// Set the syscall target.
wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) // sysenter
// See entry_amd64.go. TEXT ·sysenter(SB),NOSPLIT,$0 // _RFLAGS_IOPL0 is always set in the user mode and it is never set in // the kernel mode. See the comment of UserFlagsSet for more details. TESTL $_RFLAGS_IOPL0, R11 JZ kernel user: SWAP_GS() MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch. MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. WRITE_CR3() // Switch to kernel cr3. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. MOVQ CX, PTRACE_RIP(AX) MOVQ R11, PTRACE_FLAGS(AX) MOVQ SP, PTRACE_RSP(AX) MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value. MOVQ CX, PTRACE_RAX(AX) // Save everything else. MOVQ CX, PTRACE_ORIGRAX(AX) MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks. MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. // Return to the kernel, where the frame is: // // vector (sp+32) // userCR3 (sp+24) // regs (sp+16) // cpu (sp+8) // vcpu.Switch (sp+0) // MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. MOVQ $Syscall, 32(SP) // Output vector. RET
kernel: // We can't restore the original stack, but we can access the registers // in the CPU state directly. No need for temporary juggling. MOVQ AX, ENTRY_SCRATCH0(GS) MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. REGISTERS_SAVE(AX, CPU_REGISTERS) MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX) MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX) MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX) MOVQ ENTRY_SCRATCH0(GS), BX MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) PUSHQ AX // First argument (vCPU). CALL ·kernelSyscall(SB) // Call the trampoline. POPQ AX // Pop vCPU. JMP ·resume(SB)
Linux system call implementation
Registering the system call handler
register_syscall: xor rax, rax mov rdx, 0x00200008 mov ecx, 0xc0000081 /* MSR_STAR */ wrmsr mov eax, 0x3f7fd5 xor rdx, rdx mov ecx, 0xc0000084 /* MSR_SYSCALL_MASK */ wrmsr lea rdi, [rip + syscall_handler] mov eax, edi mov rdx, rdi shr rdx, 32 mov ecx, 0xc0000082 /* MSR_LSTAR */ wrmsr
.globl syscall_handler, kernel_stack .extern do_handle_syscall .intel_syntax noprefix kernel_stack: .quad 0 /* initialize it before the first time switching into user-mode */ user_stack: .quad 0 syscall_handler: mov [rip + user_stack], rsp mov rsp, [rip + kernel_stack] /* save non-callee-saved registers */ push rdi push rsi push rdx push rcx push r8 push r9 push r10 push r11 /* the forth argument */ mov rcx, r10 call do_handle_syscall pop r11 pop r10 pop r9 pop r8 pop rcx pop rdx pop rsi pop rdi mov rsp, [rip + user_stack] .byte 0x48 /* REX.W prefix, to indicate sysret is a 64-bit instruction */ sysret