1. 系统调用初始化流程
内核在完成早期引导(汇编阶段的准备工作)后, 执行的第一个C函数是init/main.c中定义的asmlinkage __visible void __init start_kernel(void), 由它负责内核的初始化; start_kernel()执行时, 又会调用arch/x86/kernel/traps.c中定义的void __init trap_init(void)初始化陷阱门及中断门; 在32位x86(CONFIG_X86_32)下, trap_init()通过执行set_system_trap_gate(SYSCALL_VECTOR, &system_call)完成系统调用的挂接, 并通过set_bit(SYSCALL_VECTOR, used_vectors)将该中断向量标记为已占用。
init/main.c: 500 asmlinkage __visible void __init start_kernel(void) 501 { 502 char *command_line; 503 char *after_dashes; ... 560 sort_main_extable(); 561 trap_init(); // 初始化陷阱门及中断门 562 mm_init(); ... } arch/x86/kernel/traps.c: 792 void __init trap_init(void) 793 { 794 int i; 795 796 #ifdef CONFIG_EISA 797 void __iomem *p = early_ioremap(0x0FFFD9, 4); 798 799 if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) 800 EISA_bus = 1; 801 early_iounmap(p, 4); 802 #endif 803 804 set_intr_gate(X86_TRAP_DE, divide_error); 805 set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); 806 /* int4 can be called from all */ 807 set_system_intr_gate(X86_TRAP_OF, &overflow); ... 837 838 #ifdef CONFIG_X86_32 839 set_system_trap_gate(SYSCALL_VECTOR, &system_call); // 初始化系统调用 840 set_bit(SYSCALL_VECTOR, used_vectors); 841 #endif 842 ... } arch/x86/include/asm/irq_vectors.h: 49 #define IA32_SYSCALL_VECTOR 0x80 50 #ifdef CONFIG_X86_32 51 # define SYSCALL_VECTOR 0x80 // 系统调用中断号: 0x80 52 #endif
2. 系统调用执行过程
在执行"int $0x80"汇编指令时, 便开始执行system_call: 首先, 切换到内核空间, 保护中断现场(SAVE_ALL); 其次, 比较系统调用号是否在允许的范围内(宏NR_syscalls定义的是系统调用的总数, 有效的调用号范围为0到NR_syscalls-1), 若调用号大于或等于NR_syscalls, 则跳转到syscall_badsys处执行; 若在此范围内, 则以调用号为下标(每个表项4字节)获取sys_call_table中对应的服务程序地址, 调用该服务程序; 最后将返回值保存到栈上的PT_EAX位置, 恢复中断现场。
488 489 # system call handler stub 490 ENTRY(system_call) 491 RING0_INT_FRAME # can't unwind into user space anyway 492 ASM_CLAC 493 pushl_cfi %eax # save orig_eax 494 SAVE_ALL 495 GET_THREAD_INFO(%ebp) 496 # system call tracing in operation / emulation 497 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 498 jnz syscall_trace_entry 499 cmpl $(NR_syscalls), %eax # 功能号是否有效 500 jae syscall_badsys 501 syscall_call: 502 call *sys_call_table(,%eax,4) # 执行对应的服务程序 503 syscall_after_call: 504 movl %eax,PT_EAX(%esp) # store the return value 505 syscall_exit: 506 LOCKDEP_SYS_EXIT 507 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 508 # setting need_resched or sigpending 509 # between sampling and the iret 510 TRACE_IRQS_OFF 511 movl TI_flags(%ebp), %ecx 512 testl $_TIF_ALLWORK_MASK, %ecx # current->work 513 jne syscall_exit_work 514 515 restore_all: 516 TRACE_IRQS_IRET 517 restore_all_notrace: 518 #ifdef CONFIG_X86_ESPFIX32 519 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 520 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we 521 # are returning to the kernel. 522 # See comments in process.c:copy_thread() for details. 523 movb PT_OLDSS(%esp), %ah 524 movb PT_CS(%esp), %al 525 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 526 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 527 CFI_REMEMBER_STATE 528 je ldt_ss # returning to user-space with LDT SS 529 #endif 530 restore_nocheck: 531 RESTORE_REGS 4 # skip orig_eax/error_code 532 irq_return: 533 INTERRUPT_RETURN 534 .section .fixup,"ax" 535 ENTRY(iret_exc) 536 pushl $0 # no error code 537 pushl $do_iret_error 538 jmp error_code 539 .previous 540 _ASM_EXTABLE(irq_return,iret_exc) 541 542 #ifdef CONFIG_X86_ESPFIX32 543 CFI_RESTORE_STATE 544 ldt_ss: 545 #ifdef CONFIG_PARAVIRT 546 /* 547 * The kernel can't run on a non-flat stack if paravirt mode 548 * is active. Rather than try to fixup the high bits of 549 * ESP, bypass this code entirely. 
This may break DOSemu 550 * and/or Wine support in a paravirt VM, although the option 551 * is still available to implement the setting of the high 552 * 16-bits in the INTERRUPT_RETURN paravirt-op. 553 */ 554 cmpl $0, pv_info+PARAVIRT_enabled 555 jne restore_nocheck 556 #endif 557
RING0_INT_FRAME宏定义:
256 257 .macro RING0_INT_FRAME 258 CFI_STARTPROC simple 259 CFI_SIGNAL_FRAME 260 CFI_DEF_CFA esp, 3*4 261 /*CFI_OFFSET cs, -2*4;*/ 262 CFI_OFFSET eip, -3*4 263 .endm 264
SAVE_ALL宏定义:
186 .macro SAVE_ALL 187 cld 188 PUSH_GS 189 pushl_cfi %fs 190 /*CFI_REL_OFFSET fs, 0;*/ 191 pushl_cfi %es 192 /*CFI_REL_OFFSET es, 0;*/ 193 pushl_cfi %ds 194 /*CFI_REL_OFFSET ds, 0;*/ 195 pushl_cfi %eax 196 CFI_REL_OFFSET eax, 0 197 pushl_cfi %ebp 198 CFI_REL_OFFSET ebp, 0 199 pushl_cfi %edi 200 CFI_REL_OFFSET edi, 0 201 pushl_cfi %esi 202 CFI_REL_OFFSET esi, 0 203 pushl_cfi %edx 204 CFI_REL_OFFSET edx, 0 205 pushl_cfi %ecx 206 CFI_REL_OFFSET ecx, 0 207 pushl_cfi %ebx 208 CFI_REL_OFFSET ebx, 0 209 movl $(__USER_DS), %edx 210 movl %edx, %ds 211 movl %edx, %es 212 movl $(__KERNEL_PERCPU), %edx 213 movl %edx, %fs 214 SET_KERNEL_GS %edx 215 .endm
sys_call_table定义:
arch/x86/kernel/syscall_32.c: __visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_32.h> }; arch/x86/include/generated/asm/syscalls_32.h: __SYSCALL_I386(0, sys_restart_syscall, sys_restart_syscall) __SYSCALL_I386(1, sys_exit, sys_exit) __SYSCALL_I386(2, sys_fork, stub32_fork) __SYSCALL_I386(3, sys_read, sys_read) __SYSCALL_I386(4, sys_write, sys_write) __SYSCALL_I386(5, sys_open, compat_sys_open) __SYSCALL_I386(6, sys_close, sys_close) __SYSCALL_I386(7, sys_waitpid, sys32_waitpid) __SYSCALL_I386(8, sys_creat, sys_creat) __SYSCALL_I386(9, sys_link, sys_link) __SYSCALL_I386(10, sys_unlink, sys_unlink) __SYSCALL_I386(11, sys_execve, stub32_execve) __SYSCALL_I386(12, sys_chdir, sys_chdir) __SYSCALL_I386(13, sys_time, compat_sys_time) __SYSCALL_I386(14, sys_mknod, sys_mknod) __SYSCALL_I386(15, sys_chmod, sys_chmod) __SYSCALL_I386(16, sys_lchown16, sys_lchown16) __SYSCALL_I386(18, sys_stat, sys_stat) __SYSCALL_I386(19, sys_lseek, compat_sys_lseek) __SYSCALL_I386(20, sys_getpid, sys_getpid) ...
3. fork系统调用执行分析:
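在执行系统调用指令之前, 我们先设置了系统调用的功能号"mov $0x02, %eax", 然后执行"int $0x80"。根据在trap_init()中设置的系统陷阱门, 得到中断号0x80对应的中断服务程序的入口地址是system_call, 系统开始执行system_call。system_call开头的RING0_INT_FRAME只是为调试器生成栈回溯(CFI)信息, 其注释"can't unwind into user space anyway"的意思是栈回溯无法延伸到用户空间, 并不是阻止返回用户空间。随后, system_call保存原始的功能号(orig_eax), 通过SAVE_ALL保护中断现场, 确保系统调用的功能号有效(小于NR_syscalls), 并根据该功能号得到系统调用表sys_call_table中的偏移(功能号乘以4), 从而得到该功能号对应的服务程序入口地址, 即得到sys_fork。之后, 便调用sys_fork完成fork进程的任务。最后, 保存返回值, 恢复现场, 并通过INTERRUPT_RETURN返回用户空间。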