ARM系统调用是通过SWI异常处理函数实现的,这里简要概述系统调用流程。
arch/arm/kernel/entry-common.S中定义的vector_swi负责处理系统调用(arch/arm/kernel/entry-armv.S中的异常向量表在发生SWI异常时跳转到这里),代码如下:
/*
 * vector_swi - SWI (system call) exception handler.
 *
 * Flow, as visible in this listing:
 *   1. Reserve S_FRAME_SIZE bytes on the stack and save the caller's
 *      r0-r12, sp, lr, return PC, SPSR and the original r0 in that frame.
 *   2. Work out the syscall number (scno).  Depending on the ABI
 *      configuration it is either already in r7 or is decoded from the
 *      SWI instruction located at [lr, #-4].
 *   3. Point tbl at sys_call_table (or sys_oabi_call_table for an old-ABI
 *      call when CONFIG_OABI_COMPAT is set).
 *   4. If scno < NR_syscalls, jump through tbl[scno] with lr set to
 *      ret_fast_syscall.  Otherwise fall through to arm_syscall /
 *      sys_ni_syscall.
 */
ENTRY(vector_swi)
	sub	sp, sp, #S_FRAME_SIZE		@ reserve the register frame
	stmia	sp, {r0 - r12}			@ Calling r0 - r12
	add	r8, sp, #S_PC
	stmdb	r8, {sp, lr}^			@ Calling sp, lr
	mrs	r8, spsr			@ called from non-FIQ mode, so ok.
	str	lr, [sp, #S_PC]			@ Save calling PC
	str	r8, [sp, #S_PSR]		@ Save CPSR
	str	r0, [sp, #S_OLD_R0]		@ Save OLD_R0
	zero_fp

	/*
	 * Get the system call number.
	 */

#if defined(CONFIG_OABI_COMPAT)

	/*
	 * If we have CONFIG_OABI_COMPAT then we need to look at the swi
	 * value to determine if it is an EABI or an old ABI call.
	 */
#ifdef CONFIG_ARM_THUMB
	tst	r8, #PSR_T_BIT			@ r8 still holds the saved SPSR
	movne	r10, #0				@ no thumb OABI emulation
	ldreq	r10, [lr, #-4]			@ get SWI instruction
#else
	ldr	r10, [lr, #-4]			@ get SWI instruction
  A710(	and	ip, r10, #0x0f000000		@ check for SWI		)
  A710(	teq	ip, #0x0f000000						)
  A710(	bne	.Larm710bug						)
#endif

#elif defined(CONFIG_AEABI)

	/*
	 * Pure EABI user space always put syscall number into scno (r7).
	 */
  A710(	ldr	ip, [lr, #-4]			@ get SWI instruction	)
  A710(	and	ip, ip, #0x0f000000		@ check for SWI		)
  A710(	teq	ip, #0x0f000000						)
  A710(	bne	.Larm710bug						)

#elif defined(CONFIG_ARM_THUMB)

	/* Legacy ABI only, possibly thumb mode. */
	tst	r8, #PSR_T_BIT			@ this is SPSR from save_user_regs
	addne	scno, r7, #__NR_SYSCALL_BASE	@ put OS number in
	ldreq	scno, [lr, #-4]

#else

	/* Legacy ABI only. */
	ldr	scno, [lr, #-4]			@ get SWI instruction
  A710(	and	ip, scno, #0x0f000000		@ check for SWI		)
  A710(	teq	ip, #0x0f000000						)
  A710(	bne	.Larm710bug						)

#endif

#ifdef CONFIG_ALIGNMENT_TRAP
	ldr	ip, __cr_alignment
	ldr	ip, [ip]
	mcr	p15, 0, ip, c1, c0		@ update control register
#endif
	enable_irq

	get_thread_info tsk
	adr	tbl, sys_call_table		@ load syscall table pointer
	ldr	ip, [tsk, #TI_FLAGS]		@ check for syscall tracing

#if defined(CONFIG_OABI_COMPAT)
	/*
	 * If the swi argument is zero, this is an EABI call and we do nothing.
	 *
	 * If this is an old ABI call, get the syscall number into scno and
	 * get the old ABI syscall table address.
	 */
	bics	r10, r10, #0xff000000		@ keep the low 24 bits, set flags
	eorne	scno, r10, #__NR_OABI_SYSCALL_BASE
	ldrne	tbl, =sys_oabi_call_table
#elif !defined(CONFIG_AEABI)
	bic	scno, scno, #0xff000000		@ mask off SWI op-code
	eor	scno, scno, #__NR_SYSCALL_BASE	@ check OS number
#endif

	stmdb	sp!, {r4, r5}			@ push fifth and sixth args
	tst	ip, #_TIF_SYSCALL_TRACE		@ are we tracing syscalls?
	bne	__sys_trace

	cmp	scno, #NR_syscalls		@ check upper syscall limit
	adr	lr, ret_fast_syscall		@ return address
	ldrcc	pc, [tbl, scno, lsl #2]		@ call sys_* routine

	/* Reached only when scno >= NR_syscalls (the ldrcc above was skipped). */
	add	r1, sp, #S_OFF
2:	mov	why, #0				@ no longer a real syscall
	cmp	scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
	eor	r0, scno, #__NR_SYSCALL_BASE	@ put OS number back
	bcs	arm_syscall
	b	sys_ni_syscall			@ not private func
ENDPROC(vector_swi)
1.第一次包含linux/arch/arm/kernel/calls.S,计算系统调用接口的个数,保存在NR_syscalls中;calls.S末尾再通过syscalls_padding用sys_ni_syscall把表项个数补齐到4的倍数。
/*
 *  linux/arch/arm/kernel/calls.S
 *
 *  Copyright (C) 1995-2005 Russell King
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 *  This file is included thrice in entry-common.S
 */
/*
 * Each CALL(x) below is one syscall slot; the numbered comments give the
 * slot index of the entry that follows them.  What a slot expands to is
 * decided by whatever CALL() (and ABI()/OBSOLETE()) are defined as at the
 * point of inclusion, so the order of the entries must not change.
 */
/* 0 */		CALL(sys_restart_syscall)
		CALL(sys_exit)
		CALL(sys_fork_wrapper)
		CALL(sys_read)
		CALL(sys_write)
/* 5 */		CALL(sys_open)
		CALL(sys_close)
		CALL(sys_ni_syscall)		/* was sys_waitpid */
		CALL(sys_creat)
		CALL(sys_link)
/* 10 */	CALL(sys_unlink)
		CALL(sys_execve_wrapper)
		CALL(sys_chdir)
		CALL(OBSOLETE(sys_time))	/* used by libc4 */
		CALL(sys_mknod)
/* 15 */	CALL(sys_chmod)
		CALL(sys_lchown16)
		CALL(sys_ni_syscall)		/* was sys_break */
		CALL(sys_ni_syscall)		/* was sys_stat */
		CALL(sys_lseek)
/* 20 */	CALL(sys_getpid)
		CALL(sys_mount)
		CALL(OBSOLETE(sys_oldumount))	/* used by libc4 */
		CALL(sys_setuid16)
		CALL(sys_getuid16)
/* 25 */	CALL(OBSOLETE(sys_stime))
		CALL(sys_ptrace)
		CALL(OBSOLETE(sys_alarm))	/* used by libc4 */
		CALL(sys_ni_syscall)		/* was sys_fstat */
		CALL(sys_pause)
/* 30 */	CALL(OBSOLETE(sys_utime))	/* used by libc4 */
		CALL(sys_ni_syscall)		/* was sys_stty */
		CALL(sys_ni_syscall)		/* was sys_getty */
		CALL(sys_access)
		CALL(sys_nice)
/* 35 */	CALL(sys_ni_syscall)		/* was sys_ftime */
		CALL(sys_sync)
		CALL(sys_kill)
		CALL(sys_rename)
		CALL(sys_mkdir)
/* 40 */	CALL(sys_rmdir)
		CALL(sys_dup)
		CALL(sys_pipe)
		CALL(sys_times)
		CALL(sys_ni_syscall)		/* was sys_prof */
/* 45 */	CALL(sys_brk)
		CALL(sys_setgid16)
		CALL(sys_getgid16)
		CALL(sys_ni_syscall)		/* was sys_signal */
		CALL(sys_geteuid16)
/* 50 */	CALL(sys_getegid16)
		CALL(sys_acct)
		CALL(sys_umount)
		CALL(sys_ni_syscall)		/* was sys_lock */
		CALL(sys_ioctl)
/* 55 */	CALL(sys_fcntl)
		CALL(sys_ni_syscall)		/* was sys_mpx */
		CALL(sys_setpgid)
		CALL(sys_ni_syscall)		/* was sys_ulimit */
		CALL(sys_ni_syscall)		/* was sys_olduname */
/* 60 */	CALL(sys_umask)
		CALL(sys_chroot)
		CALL(sys_ustat)
		CALL(sys_dup2)
		CALL(sys_getppid)
/* 65 */	CALL(sys_getpgrp)
		CALL(sys_setsid)
		CALL(sys_sigaction)
		CALL(sys_ni_syscall)		/* was sys_sgetmask */
		CALL(sys_ni_syscall)		/* was sys_ssetmask */
/* 70 */	CALL(sys_setreuid16)
		CALL(sys_setregid16)
		CALL(sys_sigsuspend_wrapper)
		CALL(sys_sigpending)
		CALL(sys_sethostname)
/* 75 */	CALL(sys_setrlimit)
		CALL(OBSOLETE(sys_old_getrlimit)) /* used by libc4 */
		CALL(sys_getrusage)
		CALL(sys_gettimeofday)
		CALL(sys_settimeofday)
/* 80 */	CALL(sys_getgroups16)
		CALL(sys_setgroups16)
		CALL(OBSOLETE(old_select))	/* used by libc4 */
		CALL(sys_symlink)
		CALL(sys_ni_syscall)		/* was sys_lstat */
/* 85 */	CALL(sys_readlink)
		CALL(sys_uselib)
		CALL(sys_swapon)
		CALL(sys_reboot)
		CALL(OBSOLETE(sys_old_readdir))	/* used by libc4 */
/* 90 */	CALL(OBSOLETE(old_mmap))	/* used by libc4 */
		CALL(sys_munmap)
		CALL(sys_truncate)
		CALL(sys_ftruncate)
		CALL(sys_fchmod)
/* 95 */	CALL(sys_fchown16)
		CALL(sys_getpriority)
		CALL(sys_setpriority)
		CALL(sys_ni_syscall)		/* was sys_profil */
		CALL(sys_statfs)
/* 100 */	CALL(sys_fstatfs)
		CALL(sys_ni_syscall)		/* sys_ioperm */
		CALL(OBSOLETE(ABI(sys_socketcall, sys_oabi_socketcall)))
		CALL(sys_syslog)
		CALL(sys_setitimer)
/* 105 */	CALL(sys_getitimer)
		CALL(sys_newstat)
		CALL(sys_newlstat)
		CALL(sys_newfstat)
		CALL(sys_ni_syscall)		/* was sys_uname */
/* 110 */	CALL(sys_ni_syscall)		/* was sys_iopl */
		CALL(sys_vhangup)
		CALL(sys_ni_syscall)
		CALL(OBSOLETE(sys_syscall))	/* call a syscall */
		CALL(sys_wait4)
/* 115 */	CALL(sys_swapoff)
		CALL(sys_sysinfo)
		CALL(OBSOLETE(ABI(sys_ipc, sys_oabi_ipc)))
		CALL(sys_fsync)
		CALL(sys_sigreturn_wrapper)
/* 120 */	CALL(sys_clone_wrapper)
		CALL(sys_setdomainname)
		CALL(sys_newuname)
		CALL(sys_ni_syscall)		/* modify_ldt */
		CALL(sys_adjtimex)
/* 125 */	CALL(sys_mprotect)
		CALL(sys_sigprocmask)
		CALL(sys_ni_syscall)		/* was sys_create_module */
		CALL(sys_init_module)
		CALL(sys_delete_module)
/* 130 */	CALL(sys_ni_syscall)		/* was sys_get_kernel_syms */
		CALL(sys_quotactl)
		CALL(sys_getpgid)
		CALL(sys_fchdir)
		CALL(sys_bdflush)
/* 135 */	CALL(sys_sysfs)
		CALL(sys_personality)
		CALL(sys_ni_syscall)		/* reserved for afs_syscall */
		CALL(sys_setfsuid16)
		CALL(sys_setfsgid16)
/* 140 */	CALL(sys_llseek)
		CALL(sys_getdents)
		CALL(sys_select)
		CALL(sys_flock)
		CALL(sys_msync)
/* 145 */	CALL(sys_readv)
		CALL(sys_writev)
		CALL(sys_getsid)
		CALL(sys_fdatasync)
		CALL(sys_sysctl)
/* 150 */	CALL(sys_mlock)
		CALL(sys_munlock)
		CALL(sys_mlockall)
		CALL(sys_munlockall)
		CALL(sys_sched_setparam)
/* 155 */	CALL(sys_sched_getparam)
		CALL(sys_sched_setscheduler)
		CALL(sys_sched_getscheduler)
		CALL(sys_sched_yield)
		CALL(sys_sched_get_priority_max)
/* 160 */	CALL(sys_sched_get_priority_min)
		CALL(sys_sched_rr_get_interval)
		CALL(sys_nanosleep)
		CALL(sys_arm_mremap)
		CALL(sys_setresuid16)
/* 165 */	CALL(sys_getresuid16)
		CALL(sys_ni_syscall)		/* vm86 */
		CALL(sys_ni_syscall)		/* was sys_query_module */
		CALL(sys_poll)
		CALL(sys_nfsservctl)
/* 170 */	CALL(sys_setresgid16)
		CALL(sys_getresgid16)
		CALL(sys_prctl)
		CALL(sys_rt_sigreturn_wrapper)
		CALL(sys_rt_sigaction)
/* 175 */	CALL(sys_rt_sigprocmask)
		CALL(sys_rt_sigpending)
		CALL(sys_rt_sigtimedwait)
		CALL(sys_rt_sigqueueinfo)
		CALL(sys_rt_sigsuspend_wrapper)
/* 180 */	CALL(ABI(sys_pread64, sys_oabi_pread64))
		CALL(ABI(sys_pwrite64, sys_oabi_pwrite64))
		CALL(sys_chown16)
		CALL(sys_getcwd)
		CALL(sys_capget)
/* 185 */	CALL(sys_capset)
		CALL(sys_sigaltstack_wrapper)
		CALL(sys_sendfile)
		CALL(sys_ni_syscall)		/* getpmsg */
		CALL(sys_ni_syscall)		/* putpmsg */
/* 190 */	CALL(sys_vfork_wrapper)
		CALL(sys_getrlimit)
		CALL(sys_mmap2)
		CALL(ABI(sys_truncate64, sys_oabi_truncate64))
		CALL(ABI(sys_ftruncate64, sys_oabi_ftruncate64))
/* 195 */	CALL(ABI(sys_stat64, sys_oabi_stat64))
		CALL(ABI(sys_lstat64, sys_oabi_lstat64))
		CALL(ABI(sys_fstat64, sys_oabi_fstat64))
		CALL(sys_lchown)
		CALL(sys_getuid)
/* 200 */	CALL(sys_getgid)
		CALL(sys_geteuid)
		CALL(sys_getegid)
		CALL(sys_setreuid)
		CALL(sys_setregid)
/* 205 */	CALL(sys_getgroups)
		CALL(sys_setgroups)
		CALL(sys_fchown)
		CALL(sys_setresuid)
		CALL(sys_getresuid)
/* 210 */	CALL(sys_setresgid)
		CALL(sys_getresgid)
		CALL(sys_chown)
		CALL(sys_setuid)
		CALL(sys_setgid)
/* 215 */	CALL(sys_setfsuid)
		CALL(sys_setfsgid)
		CALL(sys_getdents64)
		CALL(sys_pivot_root)
		CALL(sys_mincore)
/* 220 */	CALL(sys_madvise)
		CALL(ABI(sys_fcntl64, sys_oabi_fcntl64))
		CALL(sys_ni_syscall)		/* TUX */
		CALL(sys_ni_syscall)
		CALL(sys_gettid)
/* 225 */	CALL(ABI(sys_readahead, sys_oabi_readahead))
		CALL(sys_setxattr)
		CALL(sys_lsetxattr)
		CALL(sys_fsetxattr)
		CALL(sys_getxattr)
/* 230 */	CALL(sys_lgetxattr)
		CALL(sys_fgetxattr)
		CALL(sys_listxattr)
		CALL(sys_llistxattr)
		CALL(sys_flistxattr)
/* 235 */	CALL(sys_removexattr)
		CALL(sys_lremovexattr)
		CALL(sys_fremovexattr)
		CALL(sys_tkill)
		CALL(sys_sendfile64)
/* 240 */	CALL(sys_futex)
		CALL(sys_sched_setaffinity)
		CALL(sys_sched_getaffinity)
		CALL(sys_io_setup)
		CALL(sys_io_destroy)
/* 245 */	CALL(sys_io_getevents)
		CALL(sys_io_submit)
		CALL(sys_io_cancel)
		CALL(sys_exit_group)
		CALL(sys_lookup_dcookie)
/* 250 */	CALL(sys_epoll_create)
		CALL(ABI(sys_epoll_ctl, sys_oabi_epoll_ctl))
		CALL(ABI(sys_epoll_wait, sys_oabi_epoll_wait))
		CALL(sys_remap_file_pages)
		CALL(sys_ni_syscall)		/* sys_set_thread_area */
/* 255 */	CALL(sys_ni_syscall)		/* sys_get_thread_area */
		CALL(sys_set_tid_address)
		CALL(sys_timer_create)
		CALL(sys_timer_settime)
		CALL(sys_timer_gettime)
/* 260 */	CALL(sys_timer_getoverrun)
		CALL(sys_timer_delete)
		CALL(sys_clock_settime)
		CALL(sys_clock_gettime)
		CALL(sys_clock_getres)
/* 265 */	CALL(sys_clock_nanosleep)
		CALL(sys_statfs64_wrapper)
		CALL(sys_fstatfs64_wrapper)
		CALL(sys_tgkill)
		CALL(sys_utimes)
/* 270 */	CALL(sys_arm_fadvise64_64)
		CALL(sys_pciconfig_iobase)
		CALL(sys_pciconfig_read)
		CALL(sys_pciconfig_write)
		CALL(sys_mq_open)
/* 275 */	CALL(sys_mq_unlink)
		CALL(sys_mq_timedsend)
		CALL(sys_mq_timedreceive)
		CALL(sys_mq_notify)
		CALL(sys_mq_getsetattr)
/* 280 */	CALL(sys_waitid)
		CALL(sys_socket)
		CALL(ABI(sys_bind, sys_oabi_bind))
		CALL(ABI(sys_connect, sys_oabi_connect))
		CALL(sys_listen)
/* 285 */	CALL(sys_accept)
		CALL(sys_getsockname)
		CALL(sys_getpeername)
		CALL(sys_socketpair)
		CALL(sys_send)
/* 290 */	CALL(ABI(sys_sendto, sys_oabi_sendto))
		CALL(sys_recv)
		CALL(sys_recvfrom)
		CALL(sys_shutdown)
		CALL(sys_setsockopt)
/* 295 */	CALL(sys_getsockopt)
		CALL(ABI(sys_sendmsg, sys_oabi_sendmsg))
		CALL(sys_recvmsg)
		CALL(ABI(sys_semop, sys_oabi_semop))
		CALL(sys_semget)
/* 300 */	CALL(sys_semctl)
		CALL(sys_msgsnd)
		CALL(sys_msgrcv)
		CALL(sys_msgget)
		CALL(sys_msgctl)
/* 305 */	CALL(sys_shmat)
		CALL(sys_shmdt)
		CALL(sys_shmget)
		CALL(sys_shmctl)
		CALL(sys_add_key)
/* 310 */	CALL(sys_request_key)
		CALL(sys_keyctl)
		CALL(ABI(sys_semtimedop, sys_oabi_semtimedop))
/* vserver */	CALL(sys_ni_syscall)
		CALL(sys_ioprio_set)
/* 315 */	CALL(sys_ioprio_get)
		CALL(sys_inotify_init)
		CALL(sys_inotify_add_watch)
		CALL(sys_inotify_rm_watch)
		CALL(sys_mbind)
/* 320 */	CALL(sys_get_mempolicy)
		CALL(sys_set_mempolicy)
		CALL(sys_openat)
		CALL(sys_mkdirat)
		CALL(sys_mknodat)
/* 325 */	CALL(sys_fchownat)
		CALL(sys_futimesat)
		CALL(ABI(sys_fstatat64, sys_oabi_fstatat64))
		CALL(sys_unlinkat)
		CALL(sys_renameat)
/* 330 */	CALL(sys_linkat)
		CALL(sys_symlinkat)
		CALL(sys_readlinkat)
		CALL(sys_fchmodat)
		CALL(sys_faccessat)
/* 335 */	CALL(sys_ni_syscall)		/* eventually pselect6 */
		CALL(sys_ni_syscall)		/* eventually ppoll */
		CALL(sys_unshare)
		CALL(sys_set_robust_list)
		CALL(sys_get_robust_list)
/* 340 */	CALL(sys_splice)
		CALL(sys_sync_file_range2)
		CALL(sys_tee)
		CALL(sys_vmsplice)
		CALL(sys_move_pages)
/* 345 */	CALL(sys_getcpu)
		CALL(sys_ni_syscall)		/* eventually epoll_pwait */
		CALL(sys_kexec_load)
		CALL(sys_utimensat)
		CALL(sys_signalfd)
/* 350 */	CALL(sys_timerfd_create)
		CALL(sys_eventfd)
		CALL(sys_fallocate)
		CALL(sys_timerfd_settime)
		CALL(sys_timerfd_gettime)
/* 355 */	CALL(sys_signalfd4)
		CALL(sys_eventfd2)
		CALL(sys_epoll_create1)
		CALL(sys_dup3)
		CALL(sys_pipe2)
/* 360 */	CALL(sys_inotify_init1)
		CALL(sys_preadv)
		CALL(sys_pwritev)
/*
 * syscalls_padding is the number of extra slots needed to round
 * NR_syscalls up to the next multiple of 4.  It is computed only on the
 * first inclusion (guarded by syscalls_counted); every inclusion then
 * appends that many sys_ni_syscall filler entries.
 */
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
#endif
.rept syscalls_padding
		CALL(sys_ni_syscall)
.endr
in arch/arm/kernel/entry-common.S line102--106
/*
 * Counting pass: while CALL(x) is defined to add one to NR_syscalls,
 * including calls.S leaves NR_syscalls equal to the number of CALL()
 * entries processed.
 */
	.equ NR_syscalls,0
#define CALL(x) .equ NR_syscalls,NR_syscalls+1
#include "calls.S"
#undef CALL
/* From here on, every CALL(x) emits one .long holding x (a table entry). */
#define CALL(x) .long x
2.第2次包含linux/arch/arm/kernel/calls.S,基于#define CALL(x) .long x。
in arch/arm/kernel/entry-common.S line328--330
/*
 * sys_call_table: the table contents come from including calls.S at this
 * point, so the entries appear in the same order as the CALL() lines.
 */
	.type	sys_call_table, #object
ENTRY(sys_call_table)
#include "calls.S"
在linux/arch/arm/kernel/entry-header.S中定义了几个特殊的寄存器别名:scno用r7存储系统调用号,tbl用r8存储系统调用表入口。
/*
 * Register aliases used by the syscall entry code.
 * Note that tbl and why are both aliases for r8.
 */
scno	.req	r7		@ syscall number
tbl	.req	r8		@ syscall table pointer
why	.req	r8		@ Linux syscall (!= 0)
tsk	.req	r9		@ current thread_info
在swi异常处理函数中,通过下面这条指令把系统调用表的入口地址装入tbl(即r8):
ENTRY(vector_swi)
-->adr tbl, sys_call_table @ load syscall table pointer
sys_call_table即系统调用表入口地址,定义如下
	.type	sys_call_table, #object
ENTRY(sys_call_table)
#include "calls.S"
/*
 * At this point calls.S expands as follows:
 *	.long sys_restart_syscall
 *	.long sys_exit
 *	.long sys_fork_wrapper
 *	.long sys_read
 *	.long sys_write
 *	.long sys_open
 *	......
 */
/* Drop the ABI()/OBSOLETE() definitions that were in effect for this table. */
#undef ABI
#undef OBSOLETE
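3.第3次包含linux/arch/arm/kernel/calls.S,仍然基于#define CALL(x) .long x,但在包含之前(仅当配置了CONFIG_OABI_COMPAT时)把ABI(native, compat)重新定义为compat、把OBSOLETE(syscall)重新定义为syscall,从而构造出兼容旧ABI的sys_oabi_call_table。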
in arch/arm/kernel/entry-common.S line477--479
/*
 * sys_oabi_call_table: the table vector_swi switches to for old-ABI calls
 * (ldrne tbl, =sys_oabi_call_table).  Its contents come from including
 * calls.S once more at this point.
 */
	.type	sys_oabi_call_table, #object
ENTRY(sys_oabi_call_table)
#include "calls.S"
4.最终的系统调用
ENTRY(vector_swi) -->ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine
最后的问题是系统调用表中的系统调用函数怎么定义的?这里以fs/open.c中的sys_open为例,概述系统调用函数的定义
/*
 * open system call, declared through SYSCALL_DEFINE3 with three
 * (type, name) argument pairs: filename, flags and mode.
 *
 * ORs O_LARGEFILE into flags when force_o_largefile() is true, then hands
 * the work to do_sys_open() relative to AT_FDCWD and returns its result.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
	long ret;

	if (force_o_largefile())
		flags |= O_LARGEFILE;

	ret = do_sys_open(AT_FDCWD, filename, flags, mode);
	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(3, ret, filename, flags, mode);
	return ret;
}
SYSCALL_DEFINE3及其依赖的宏定义如下:
/* The invocation being expanded: */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)

/*include/linux/syscalls.h*/
/* Step 1: prefix the name with '_' and pass the argument count (3) along. */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
/* Step 2: plain forwarding to the worker macro. */
#define SYSCALL_DEFINEx(x, sname, ...) __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
/* Step 3: emit the prototype "asmlinkage long sys<name>(<decls>)". */
#define __SYSCALL_DEFINEx(x, name, ...) asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))

/*
 * __SC_DECLn turns n (type, name) pairs into a parameter list by peeling
 * off the first pair and recursing on the remainder via __SC_DECL(n-1).
 */
#define __SC_DECL1(t1, a1) t1 a1
#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
#define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__)
#define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__)
#define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__)
#define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__)
按照以上的宏定义展开,最终为
/*
 * sys_open as it reads once the SYSCALL_DEFINE3 macros are expanded.
 *
 * ORs O_LARGEFILE into flags when force_o_largefile() is true, then hands
 * the work to do_sys_open() relative to AT_FDCWD and returns its result.
 */
asmlinkage long sys_open(const char __user * filename,int flags,int mode)
{
	long ret;

	if (force_o_largefile())
		flags |= O_LARGEFILE;

	ret = do_sys_open(AT_FDCWD, filename, flags, mode);
	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(3, ret, filename, flags, mode);
	return ret;
}
5.简化的系统调用流程
/*
 * Simplified view of vector_swi for a CONFIG_OABI_COMPAT kernel without
 * Thumb support.  The alignment-trap setup, syscall tracing and the
 * handling of out-of-range syscall numbers are left out of this listing.
 */
	.align	5
ENTRY(vector_swi)
	/* 1. Push the caller's registers into a frame on the stack */
	sub	sp, sp, #S_FRAME_SIZE
	stmia	sp, {r0 - r12}			@ Calling r0 - r12
	add	r8, sp, #S_PC
	stmdb	r8, {sp, lr}^			@ Calling sp, lr
	mrs	r8, spsr			@ called from non-FIQ mode, so ok.
	str	lr, [sp, #S_PC]			@ Save calling PC
	str	r8, [sp, #S_PSR]		@ Save CPSR
	str	r0, [sp, #S_OLD_R0]		@ Save OLD_R0
	zero_fp

	/* 2. Get the system call number. */
	/*
	 * Unconditional load: the "tst r8, #PSR_T_BIT" that would set the
	 * flags for a conditional ldreq is not part of this simplified flow.
	 */
	ldr	r10, [lr, #-4]			@ get SWI instruction
	enable_irq

	/* 3. Array of system call function pointers (native table by default) */
	adr	tbl, sys_call_table		@ load syscall table pointer

	/*
	 * If the swi argument is zero, this is an EABI call and we do nothing.
	 *
	 * If this is an old ABI call, get the syscall number into scno and
	 * get the old ABI syscall table address.
	 */
	bics	r10, r10, #0xff000000
	eorne	scno, r10, #__NR_OABI_SYSCALL_BASE
	ldrne	tbl, =sys_oabi_call_table

	stmdb	sp!, {r4, r5}			@ push fifth and sixth args
	cmp	scno, #NR_syscalls		@ check upper syscall limit
	/* Return path out of the SWI exception */
	adr	lr, ret_fast_syscall		@ return address
	/* 4. Jump to the specific system call function */
	ldrcc	pc, [tbl, scno, lsl #2]		@ call sys_* routine
ENDPROC(vector_swi)
6.swi异常返回
in arch/arm/kernel/entry-common.S
/*
 * ret_fast_syscall - return path for system calls; vector_swi installs
 * this address in lr before jumping to the sys_* routine.
 *
 * With interrupts disabled it checks the thread flags: if any bit of
 * _TIF_WORK_MASK is set it branches to fast_work_pending, otherwise it
 * restores the saved registers from the stack frame and returns from the
 * exception.  r0 is not reloaded from the frame (the ldmdb starts at r1).
 */
ret_fast_syscall:
 UNWIND(.fnstart	)
 UNWIND(.cantunwind	)
	disable_irq				@ disable interrupts
	ldr	r1, [tsk, #TI_FLAGS]
	tst	r1, #_TIF_WORK_MASK
	bne	fast_work_pending

	/* perform architecture specific actions before user return */
	arch_ret_to_user r1, lr

	@ fast_restore_user_regs
	@ NOTE(review): S_OFF presumably skips the r4/r5 pair that vector_swi
	@ pushed below the register frame - confirm against entry-header.S.
	ldr	r1, [sp, #S_OFF + S_PSR]	@ get calling cpsr
	ldr	lr, [sp, #S_OFF + S_PC]!	@ get pc
	msr	spsr_cxsf, r1			@ save in spsr_svc
	ldmdb	sp, {r1 - lr}^			@ get calling r1 - lr
	mov	r0, r0				@ no-op; presumably required padding after the ldm with ^ - TODO confirm
	add	sp, sp, #S_FRAME_SIZE - S_PC
	movs	pc, lr				@ return & move spsr_svc into cpsr
 UNWIND(.fnend		)