(一)首先,系统调用有两种方式:
- 0x80、0x81、0x82三个中断号;
- 专门指令(至少分Intel架构和ARM架构),比如SYSENTER/SYSCALL
(二)话分两头,先说中断向量方式
这是终端向量定义的部分代码:
INTERRUPT(0x7d) INTERRUPT(0x7e) USER_TRAP(0x7f, idt64_dtrace_ret) /* Required by dtrace "fasttrap" */ USER_TRAP_SPC(0x80,idt64_unix_scall) USER_TRAP_SPC(0x81,idt64_mach_scall) USER_TRAP_SPC(0x82,idt64_mdep_scall) INTERRUPT(0x83) INTERRUPT(0x84) INTERRUPT(0x85) INTERRUPT(0x86)
(BSD风格的系统调用,终端号就是0x80)
触发中断以及后面的逻辑,都在汇编文件idt64.s中实现,下面简单看看:
/* * System call handlers. * These are entered via a syscall interrupt. The system call number in %rax * is saved to the error code slot in the stack frame. We then branch to the * common state saving code. */ #ifndef UNIX_INT #error NO UNIX INT!!! #endif Entry(idt64_unix_scall) swapgs /* switch to kernel gs (cpu_data) */ pushq %rax /* save system call number */ PUSH_FUNCTION(HNDL_UNIX_SCALL) pushq $(UNIX_INT)
接下来执行PUSH_FUNCTIOIN(HNDL_UNIX_SCALL),先展开PUSH_FUNCTION看看:
#if 1 #define PUSH_FUNCTION(func) sub $8, %rsp ; push %rax ; leaq func(%rip), %rax ; movq %rax, 8(%rsp) ; pop %rax #else #define PUSH_FUNCTION(func) pushq func #endif
系统调用号,在寄存器RAX,接下来看看HNDL_UNIX_SCALL:
Entry(hndl_unix_scall) TIME_TRAP_UENTRY movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ movq TH_TASK(%rcx),%rbx /* point to current task */ incl TH_SYSCALLS_UNIX(%rcx) /* increment call count */ /* Check for active vtimers in the current task */ TASK_VTIMER_CHECK(%rbx,%rcx) sti CCALL1(unix_syscall, %r15) /* * always returns through thread_exception_return */
主要有一行:unix_syscall,看看unix_syscall函数的definition:
/* * Function: unix_syscall * * Inputs: regs - pointer to i386 save area * * Outputs: none */ void unix_syscall(x86_saved_state_t *state) { thread_t thread; void *vt; unsigned int code; struct sysent *callp; int error; vm_offset_t params; struct proc *p; struct uthread *uthread; x86_saved_state32_t *regs; boolean_t is_vfork; assert(is_saved_state32(state)); regs = saved_state32(state); #if DEBUG if (regs->eax == 0x800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ is_vfork = uthread->uu_flag & UT_VFORK; if (__improbable(is_vfork != 0)) p = current_proc(); else p = (struct proc *)get_bsdtask_info(current_task()); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->eax = EPERM; regs->efl |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } code = regs->eax & I386_SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u ", code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip); params = (vm_offset_t) (regs->uesp + sizeof (int)); regs->efl &= ~(EFL_CF); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; }
.........
通过寄存器中的数据得到code,再通过code取得数组sysent中的系统调用函数,交给callp;后面的代码冗长,这里就不全部贴出来咯。
(关于sysent数组,改天详述)
(三)再说系统调用专用指令方式(以Intel架构为例)
SYSENTER用于32位,SYSCALL用于64位,只说SYSCALL吧,先看汇编:
Entry(hi64_syscall) Entry(idt64_syscall) L_syscall_continue: swapgs /* Kapow! get per-cpu data area */ mov %rsp, %gs:CPU_UBER_TMP /* save user stack */ mov %gs:CPU_UBER_ISF, %rsp /* switch stack to pcb */ /* * Save values in the ISF frame in the PCB * to cons up the saved machine state. */ movl $(USER_DS), ISF64_SS(%rsp) movl $(SYSCALL_CS), ISF64_CS(%rsp) /* cs - a pseudo-segment */ mov %r11, ISF64_RFLAGS(%rsp) /* rflags */ mov %rcx, ISF64_RIP(%rsp) /* rip */ mov %gs:CPU_UBER_TMP, %rcx mov %rcx, ISF64_RSP(%rsp) /* user stack */ mov %rax, ISF64_ERR(%rsp) /* err/rax - syscall code */ movq $(T_SYSCALL), ISF64_TRAPNO(%rsp) /* trapno */ leaq HNDL_SYSCALL(%rip), %r11; movq %r11, ISF64_TRAPFN(%rsp) mov ISF64_RFLAGS(%rsp), %r11 /* Avoid leak, restore R11 */ jmp L_dispatch_U64 /* this can only be 64-bit */
主要看看HNDL_SYSCALL:
/* * 64bit Tasks * System call entries via syscall only: * * r15 x86_saved_state64_t * rsp kernel stack * * both rsp and r15 are 16-byte aligned * interrupts disabled * direction flag cleared */ Entry(hndl_syscall) TIME_TRAP_UENTRY movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ movq TH_TASK(%rcx),%rbx /* point to current task */ /* Check for active vtimers in the current task */ TASK_VTIMER_CHECK(%rbx,%rcx) /* * We can be here either for a mach, unix machdep or diag syscall, * as indicated by the syscall class: */ movl R64_RAX(%r15), %eax /* syscall number/class */ movl %eax, %edx andl $(SYSCALL_CLASS_MASK), %edx /* syscall class */ cmpl $(SYSCALL_CLASS_MACH<<SYSCALL_CLASS_SHIFT), %edx je EXT(hndl_mach_scall64) cmpl $(SYSCALL_CLASS_UNIX<<SYSCALL_CLASS_SHIFT), %edx je EXT(hndl_unix_scall64) cmpl $(SYSCALL_CLASS_MDEP<<SYSCALL_CLASS_SHIFT), %edx je EXT(hndl_mdep_scall64) cmpl $(SYSCALL_CLASS_DIAG<<SYSCALL_CLASS_SHIFT), %edx je EXT(hndl_diag_scall64) /* Syscall class unknown */ sti CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1) /* no return */
可以看到,这里根据寄存器和全局参数区分4种系统调用,BSD风格的系统调用只是第1种,还有3种:mach syscall、machdep syscall、diag syscall;
如果是BSD风格系统调用,那么就继续执行hndl_unix_scall64:
Entry(hndl_unix_scall64) incl TH_SYSCALLS_UNIX(%rcx) /* increment call count */ sti CCALL1(unix_syscall64, %r15) /* * always returns through thread_exception_return */
只有一个函数调用,unix_syscall64,接下来看看这个函数的definition:
void unix_syscall64(x86_saved_state_t *state) { thread_t thread; unsigned int code; struct sysent *callp; void *uargp; int args_in_regs; int error; struct proc *p; struct uthread *uthread; x86_saved_state64_t *regs; assert(is_saved_state64(state)); regs = saved_state64(state); #if DEBUG if (regs->rax == 0x2000800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ if (__probable(!(uthread->uu_flag & UT_VFORK))) p = (struct proc *)get_bsdtask_info(current_task()); else p = current_proc(); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->rax = EPERM; regs->isf.rflags |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } args_in_regs = 6; code = regs->rax & SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: code=%d(%s) rip=%llx ", code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; uargp = (void *)(®s->rdi); if (__improbable(callp == sysent)) { /* * indirect system call... system call number * passed as 'arg0' */ code = regs->rdi; callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; uargp = (void *)(®s->rsi); args_in_regs = 5; }
..........
可以看到这里首先从x86_saved_state_t中取得系统调用号code,然后从数组sysent中得到系统调用函数,给callp;再后面是一些参数处理,和callp的执行。
接下去就到了具体的系统调用函数。
(大概介绍如上,有人拍砖吗?一起了解啊~)