zoukankan      html  css  js  c++  java
  • gvisor entersyscall exitsyscall

    The Sentry runs in both GR0 and HR3. The bluepill code is responsible for transparently bouncing the Sentry between these two modes, with the following transition events:

    In HR3:
    CLI (disable interrupts) => switch to GR0
     
    In GR0:
    Fault => switch to HR3
    System call => switch to HR3
     
    syscall() ---> sysenter() ----> kernelSyscall() (note: this step is in the ring0 kernel) ----> "HLT" instruction ----> kvm_emulate_halt() in KVM (trap to KVM) ----> bluepillHandler() handles _KVM_EXIT_HLT ----> c.notify() ----> syscall.RawSyscall6(syscall.SYS_FUTEX, ...)
     
    I also checked the asm code for syscall.RawSyscall6() in https://golang.org/src/syscall/asm_darwin_amd64.s?h=RawSyscall
     
    //go:nosplit
    func (c *vCPU) notify() {
        _, _, errno := syscall.RawSyscall6(
            syscall.SYS_FUTEX,
            uintptr(unsafe.Pointer(&c.state)),
            linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
            math.MaxInt32, // Number of waiters.
            0, 0, 0)
        if errno != 0 {
            throw("futex wake error")
        }
    }

    The HLT triggers a VMEXIT, which manifests as a return to the KVM_RUN ioctl on the host side. This is in the host signal handler. The state is then copied into the signal frame, and signal_return is called.

    entersyscall 和exitsyscall 是golang runtime的entersyscall和 exitsyscall

            entersyscall()
            bluepill(c)
            vector = c.CPU.SwitchToUser(switchOpts)
            exitsyscall()
    
            switch vector {
            case ring0.Syscall, ring0.SyscallInt80:
                    // Fast path: system call executed.
                    return usermem.NoAccess, nil
    
            case ring0.PageFault:
                    return c.fault(int32(syscall.SIGSEGV), info)
    
            case ring0.Debug, ring0.Breakpoint:
                    *info = arch.SignalInfo{
                            Signo: int32(syscall.SIGTRAP),
                            Code:  1, // TRAP_BRKPT (breakpoint).
                    }
                    info.SetAddr(switchOpts.Registers.Rip) // Include address.
                    return usermem.AccessType{}, platform.ErrContextSignal
    entry_impl_amd64.s:44:#define SyscallInt80               0x80
    entry_impl_amd64.s:45:#define Syscall                    0x100
    func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
            userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
            c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))
    
            // Sanitize registers.
            regs := switchOpts.Registers
            regs.Eflags &= ^uint64(UserFlagsClear)
            regs.Eflags |= UserFlagsSet
            regs.Cs = uint64(Ucode64) // Required for iret.
            regs.Ss = uint64(Udata)   // Ditto.
    
            // Perform the switch.
            swapgs()                                         // GS will be swapped on return.
            WriteFS(uintptr(regs.Fs_base))                   // escapes: no. Set application FS.
            WriteGS(uintptr(regs.Gs_base))                   // escapes: no. Set application GS.
            LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
            if switchOpts.FullRestore {
                    vector = iret(c, regs, uintptr(userCR3))   //sysenter
     } else 
    {
    vector
    = sysret(c, regs, uintptr(userCR3))
    }
    SaveFloatingPoint(switchOpts.FloatingPointState)

    // escapes: no. Copy out floating point. WriteFS(uintptr(c.registers.Fs_base))

    // escapes: no. Restore kernel FS.

    return

    }

    guest 执行代码

     // Set the entrypoint for the kernel: the initial instruction pointer,
     // the first-argument register, the stack pointer and the flags.
            kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())           // Start address of the code (ring0.Start).
            kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) // Address of this vCPU's CPU struct.
            kernelUserRegs.RSP = c.StackTop() // Stack pointer starts at the top of the vCPU stack.
            kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
    // start performs the per-CPU setup visible here: it loads the GS and FS
    // bases, initializes the floating point unit and XCR0, and programs the
    // MSRs that control the syscall instruction so that it enters sysenter.
    //
    //go:nosplit
    func start(c *CPU) {
        // Save per-cpu & FS segment: GS is set from c.kernelEntry and FS from
        // the FS base stored in c.registers.
        WriteGS(kernelAddr(c.kernelEntry))
        WriteFS(uintptr(c.registers.Fs_base))
    
        // Initialize floating point.
        //
        // Note that on skylake, the valid XCR0 mask reported seems to be 0xff.
        // This breaks down as:
        //
        //    bit0   - x87
        //    bit1   - SSE
        //    bit2   - AVX
        //    bit3-4 - MPX
        //    bit5-7 - AVX512
        //
        // For some reason, enabling MPX & AVX512 on platforms that report them
        // seems to cause a general protection fault. (Maybe there are some
        // virtualization issues and these aren't exported to the guest cpuid.)
        // This needs further investigation, but we can limit the floating
        // point operations to x87, SSE & AVX for now.
        fninit()
        // The 0x7 mask keeps only bits 0-2 (x87, SSE, AVX) of the valid mask.
        xsetbv(0, validXCR0Mask&0x7)
    
        // Set the syscall target.
        wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
        wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)
    
        // NOTE: This depends on having the 64-bit segments immediately
        // following the 32-bit user segments. This is simply the way the
        // sysret instruction is designed to work (it assumes they follow).
        wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
        // CSTAR is given the same handler as LSTAR.
        wrmsr(_MSR_CSTAR, kernelFunc(sysenter))
    }

    sysenter

    // Set the syscall target.
            wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) // sysenter
    // See entry_amd64.go.
    //
    // sysenter is the routine installed as the syscall target (it is the
    // value written to _MSR_LSTAR and _MSR_CSTAR). On entry, CX is stored as
    // the saved RIP and R11 as the saved FLAGS, and R11 is also used to tell
    // whether the syscall came from user mode or from kernel mode.
    //
    // User path: switch to the kernel GS and CR3, save the user register set,
    // switch to the kernel stack recorded in the CPU registers, write the
    // Syscall vector into the return frame and RET.
    //
    // Kernel path: save the registers directly into the CPU state, load the
    // kernel stack, call kernelSyscall with the vCPU as argument and then
    // jump to resume.
    TEXT ·sysenter(SB),NOSPLIT,$0
            // _RFLAGS_IOPL0 is always set in the user mode and it is never set in
            // the kernel mode. See the comment of UserFlagsSet for more details.
            TESTL $_RFLAGS_IOPL0, R11
            JZ kernel
    user:
            SWAP_GS()
            MOVQ AX, ENTRY_SCRATCH0(GS)            // Save user AX on scratch.
            MOVQ ENTRY_KERNEL_CR3(GS), AX          // Get kernel cr3 on AX.
            WRITE_CR3()                            // Switch to kernel cr3.
    
            MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
            MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX  // Get user regs.
            REGISTERS_SAVE(AX, 0)                  // Save all except IP, FLAGS, SP, AX.
            MOVQ CX,  PTRACE_RIP(AX)
            MOVQ R11, PTRACE_FLAGS(AX)
            MOVQ SP,  PTRACE_RSP(AX)
            MOVQ ENTRY_SCRATCH0(GS), CX            // Load saved user AX value.
            MOVQ CX,  PTRACE_RAX(AX)               // Save everything else.
            MOVQ CX,  PTRACE_ORIGRAX(AX)
    
            // AX was repurposed as the user register pointer above, so the
            // vCPU pointer is reloaded before touching the CPU state.
            MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
            MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP  // Get stacks.
            MOVQ $0, CPU_ERROR_CODE(AX)            // Clear error code.
            MOVQ $1, CPU_ERROR_TYPE(AX)            // Set error type to user.
    
            // Return to the kernel, where the frame is:
            //
            //      vector      (sp+32)
            //      userCR3     (sp+24)
            //      regs        (sp+16)
            //      cpu         (sp+8)
            //      vcpu.Switch (sp+0)
            //
            MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
            MOVQ $Syscall, 32(SP)                 // Output vector.
            RET
    kernel:
            // We can't restore the original stack, but we can access the registers
            // in the CPU state directly. No need for temporary juggling.
            MOVQ AX,  ENTRY_SCRATCH0(GS)
            MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
            REGISTERS_SAVE(AX, CPU_REGISTERS)
            MOVQ CX,  CPU_REGISTERS+PTRACE_RIP(AX)
            MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
            MOVQ SP,  CPU_REGISTERS+PTRACE_RSP(AX)
            // The original AX was parked in the scratch slot; store it as
            // both ORIGRAX and RAX.
            MOVQ ENTRY_SCRATCH0(GS), BX
            MOVQ BX,  CPU_REGISTERS+PTRACE_ORIGRAX(AX)
            MOVQ BX,  CPU_REGISTERS+PTRACE_RAX(AX)
            MOVQ $0,  CPU_ERROR_CODE(AX)                // Clear error code.
            MOVQ $0,  CPU_ERROR_TYPE(AX)                // Set error type to kernel.
    
            // Call the syscall trampoline.
            LOAD_KERNEL_STACK(GS)
            PUSHQ AX                // First argument (vCPU).
            CALL ·kernelSyscall(SB) // Call the trampoline.
            POPQ AX                 // Pop vCPU.
            JMP ·resume(SB)

    linux 系统调用实现

     注册系统调用

    /*
     * register_syscall: program the three MSRs used by the syscall
     * instruction (MSR_STAR, MSR_SYSCALL_MASK, MSR_LSTAR). For each wrmsr the
     * MSR index is placed in ecx and the value is split across edx (high
     * half) and eax (low half).
     */
    register_syscall:
      /* MSR_STAR: low half 0, high half 0x00200008. */
      /* NOTE(review): the high half presumably encodes the kernel/user segment selector bases - confirm against the GDT layout. */
      xor rax, rax
      mov rdx, 0x00200008
      mov ecx, 0xc0000081 /* MSR_STAR */
      wrmsr
    
      /* MSR_SYSCALL_MASK: low half 0x3f7fd5, high half 0. */
      mov eax, 0x3f7fd5
      xor rdx, rdx
      mov ecx, 0xc0000084 /* MSR_SYSCALL_MASK */
      wrmsr
    
      /* MSR_LSTAR: address of syscall_handler, low 32 bits in eax and high 32 bits in edx. */
      lea rdi, [rip + syscall_handler]
      mov eax, edi
      mov rdx, rdi
      shr rdx, 32
      mov ecx, 0xc0000082 /* MSR_LSTAR */
      wrmsr
    /*
     * syscall_handler is the address written to MSR_LSTAR by register_syscall.
     * It switches from the user stack to kernel_stack, preserves the argument
     * and scratch registers around a call to do_handle_syscall, switches back
     * to the user stack and returns with a 64-bit sysret.
     *
     * rax is neither saved nor restored here, so the value do_handle_syscall
     * leaves in rax is what is visible after sysret.
     */
    .globl syscall_handler, kernel_stack
    .extern do_handle_syscall
    .intel_syntax noprefix
    
    kernel_stack: .quad 0 /* initialize it before the first time switching into user-mode */
    user_stack: .quad 0
    /* NOTE(review): user_stack is a single global slot, so this appears to assume one CPU and no nested syscalls - confirm. */
    
    syscall_handler:
      /* stash the user stack pointer and switch to the kernel stack */
      mov [rip + user_stack], rsp
      mov rsp, [rip + kernel_stack]
      /* save non-callee-saved registers */
      push rdi
      push rsi
      push rdx
      push rcx
      push r8
      push r9
      push r10
      push r11
    
      /* the fourth argument: it arrives in r10 and is moved to rcx for the call */
      mov rcx, r10
      call do_handle_syscall
    
      /* restore the saved registers in reverse order */
      pop r11
      pop r10
      pop r9
      pop r8
      pop rcx
      pop rdx
      pop rsi
      pop rdi
    
      /* switch back to the user stack and return to user mode */
      mov rsp, [rip + user_stack]
      .byte 0x48 /* REX.W prefix, to indicate sysret is a 64-bit instruction */
      sysret
  • 相关阅读:
    数据结构作业-二叉树
    51nod 1163 最高的奖励
    51nod 1091 线段的重叠
    实验1 顺序表及其应用
    51nod 1459 迷宫游戏 dijkstra模板
    html5拖拽
    onbeforeunload、onpagehide、onunload、onload、onpageshow的正确执行顺序
    HTML5游戏2D开发引擎
    如何定义现代浏览器
    api文档设计工具:RAML、Swagger
  • 原文地址:https://www.cnblogs.com/dream397/p/14304698.html
Copyright © 2011-2022 走看看