内核创建进程所进行的工作,本文阅读的内核代码为Linux kernel 2.6。
进程创建的大部分工作由do_fork这个函数完成,函数原型如下:
long do_fork(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr)
1、首先进行一些参数及权限的检查。
if (clone_flags & CLONE_NEWUSER) { if (clone_flags & CLONE_THREAD) return -EINVAL; /* hopefully this check will go away when userns support is * complete */ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || !capable(CAP_SETGID)) return -EPERM; }
2、进行状态的检查。这里主要是检查clone_flags中是否带有已废弃的CLONE_STOPPED(以停止状态创建进程)标志,若带有该标志,则打印一条受频率限制的提示信息(最多打印100次)。
if (unlikely(clone_flags & CLONE_STOPPED)) { static int __read_mostly count = 100; if (count > 0 && printk_ratelimit()) { char comm[TASK_COMM_LEN]; count--; printk(KERN_INFO "fork(): process `%s' used deprecated " "clone flags 0x%lx ", get_task_comm(comm, current), clone_flags & CLONE_STOPPED); } }
3、用户空间检查,下面这段代码比较有用。主要是先做复制前的准备工作,然后复制当前进程。
/* * When called from kernel_thread, don't do user tracing stuff. */
if (likely(user_mode(regs))) trace = tracehook_prepare_clone(clone_flags);
p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace);
首先是用户空间压栈操作,保存寄存器。其中regs是这么一个参数。
/*
 * Saved register frame handed to do_fork()/copy_process() as `regs`.
 * NOTE(review): this 64-bit layout (63 general + 8 target registers) looks
 * like the SuperH SH-5 variant of pt_regs; other architectures define a
 * different struct -- confirm which arch the article is reading.
 */
struct pt_regs {
	unsigned long long pc;		/* program counter */
	unsigned long long sr;		/* presumably the status register -- TODO confirm */
	long long syscall_nr;		/* system call number (signed) */
	unsigned long long regs[63];	/* presumably the general-purpose registers */
	unsigned long long tregs[8];	/* presumably the branch-target registers */
	unsigned long long pad[2];	/* padding; not otherwise described here */
};
从结构体中的成员可以看到,包含
pc:程序计数器
sr:状态寄存器(status register)
syscall_nr:系统调用号
总之,这里包含的是进程让出CPU时需要保存的最小信息。
p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace);
这里即复制一个进程。返回值为指向task_struct结构体的指针,该结构描述了一个进程的基本状态。这里并不进行详细的介绍。
4、对创建的进程进行错误检查(IS_ERR(p))。这里出错的可能性不大,可以暂时先不管,把握住我们的主线。
if (!IS_ERR(p)) { struct completion vfork;
trace_sched_process_fork(current, p);
nr = task_pid_vnr(p);
if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr);
if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork);
}
5、然后接着调用下面两个函数。
audit_finish_fork(p);//主要是检查完成的进程的状态。
tracehook_report_clone(regs, clone_flags, nr, p);//主要是阻塞刚刚创建的子进程,因为现在还是在父进程进程中,子进程并未开始执行,暂时挂起子进程。下面为函数解释。
/**
* tracehook_report_clone - in parent, new child is about to start running
* @regs: parent's user register state
* @clone_flags: flags from parent's system call
* @pid: new child's PID in the parent's namespace
* @child: new child task
*
* Called after a child is set up, but before it has been started running.
* This is not a good place to block, because the child has not started
* yet. Suspend the child here if desired, and then block in
* tracehook_report_clone_complete(). This must prevent the child from
* self-reaping if tracehook_report_clone_complete() uses the @child
* pointer; otherwise it might have died and been released by the time
* tracehook_report_clone_complete() is called.
*
* Called with no locks held, but the child cannot run until this returns.
*/
6、设置进程标志位。
/*
* We set PF_STARTING at creation in case tracing wants to
* use this to distinguish a fully live task from one that
* hasn't gotten to tracehook_report_clone() yet. Now we
* clear it and set the child going.
*/
p->flags &= ~PF_STARTING;
(PF_STARTING宏定义解释为:#define PF_STARTING 0x00000002 /* being created */,表明该进程正在创建中;这里清除该标志位,表示进程的创建已经完成,可以开始运行)
7、唤醒进程。这里先判断复制标志是否为 CLONE_STOPPED状态,但是大多数情形下,并不为CLONE_STOPPED状态。
if (unlikely(clone_flags & CLONE_STOPPED)) {
/*
* We'll start up with an immediate SIGSTOP.
*/
sigaddset(&p->pending.signal, SIGSTOP);
set_tsk_thread_flag(p, TIF_SIGPENDING);
__set_task_state(p, TASK_STOPPED);
} else {
wake_up_new_task(p, clone_flags);
}
tracehook_report_clone_complete(trace, regs,
clone_flags, nr, p);/*这个函数主要是报告当前之前阻塞的子进程已经开始运行*/
下面是wake_up_new_task函数功能解释。主要功能是首次唤醒新创建的进程,同时完成调度器统计信息的一些初始化工作,并将进程放入运行队列中。
关于一个进程如何添加到队列中去,可以从这里进行研究。本文先不对此进行研究,还是放在进程的创建上来。
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
8、检查clone_flags标志位。若当前标志位为 CLONE_VFORK(#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */),该标志位表明父进程希望子进程在mm_release(释放其内存描述符)时唤醒自己。wait_for_completion(&vfork);内部会用自旋锁保护等待队列,其主要作用是让父进程睡眠等待,直到子进程通过前面设置的p->vfork_done完成这个completion为止。
if (clone_flags & CLONE_VFORK) {
freezer_do_not_count();
wait_for_completion(&vfork);
freezer_count();
tracehook_report_vfork_done(p, nr);
}
9、完成所有操作,返回。成功时返回值为新进程的pid(由task_pid_vnr(p)得到);若copy_process失败,则返回PTR_ERR(p)给出的错误码。
} else {
nr = PTR_ERR(p);
}
return nr;
10、下面是完整的程序。
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * @clone_flags:   CLONE_* bits; the ones tested here are CLONE_NEWUSER,
 *                 CLONE_THREAD, CLONE_STOPPED, CLONE_PARENT_SETTID and
 *                 CLONE_VFORK. The full value is forwarded to copy_process().
 * @stack_start:   forwarded unchanged to copy_process().
 * @regs:          caller's saved registers; used for the user_mode() test and
 *                 forwarded to copy_process() and the tracehook calls.
 * @stack_size:    forwarded unchanged to copy_process().
 * @parent_tidptr: user pointer that receives the new pid when
 *                 CLONE_PARENT_SETTID is set.
 * @child_tidptr:  forwarded unchanged to copy_process().
 *
 * Returns the value of task_pid_vnr() for the new task on success,
 * PTR_ERR() of copy_process()'s result on failure, or -EINVAL / -EPERM
 * from the CLONE_NEWUSER checks.
 */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     struct pt_regs *regs,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Do some preliminary argument and permissions checking before we
	 * actually start allocating stuff
	 */
	if (clone_flags & CLONE_NEWUSER) {
		/* CLONE_NEWUSER combined with CLONE_THREAD is rejected. */
		if (clone_flags & CLONE_THREAD)
			return -EINVAL;
		/* hopefully this check will go away when userns support is
		 * complete
		 */
		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
		    !capable(CAP_SETGID))
			return -EPERM;
	}

	/*
	 * We hope to recycle these flags after 2.6.26
	 *
	 * CLONE_STOPPED is only warned about here, not rejected; the static
	 * counter caps the number of warnings at 100, on top of
	 * printk_ratelimit().
	 */
	if (unlikely(clone_flags & CLONE_STOPPED)) {
		static int __read_mostly count = 100;

		if (count > 0 && printk_ratelimit()) {
			char comm[TASK_COMM_LEN];

			count--;
			/*
			 * NOTE(review): the message ends in a space rather
			 * than a newline -- possibly lost when the code was
			 * pasted; confirm against the kernel tree.
			 */
			printk(KERN_INFO "fork(): process `%s' used deprecated "
					 "clone flags 0x%lx ",
				get_task_comm(comm, current),
				clone_flags & CLONE_STOPPED);
		}
	}

	/*
	 * When called from kernel_thread, don't do user tracing stuff.
	 */
	if (likely(user_mode(regs)))
		trace = tracehook_prepare_clone(clone_flags);

	p = copy_process(clone_flags, stack_start, regs, stack_size,
			 child_tidptr, NULL, trace);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		/* Lives on this stack frame; only waited on for CLONE_VFORK. */
		struct completion vfork;

		trace_sched_process_fork(current, p);

		/* The pid is read before the child is woken (see above). */
		nr = task_pid_vnr(p);

		/* Note: the return value of put_user() is not checked. */
		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		/* Hook the completion into the child before it can run. */
		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
		}

		audit_finish_fork(p);
		tracehook_report_clone(regs, clone_flags, nr, p);

		/*
		 * We set PF_STARTING at creation in case tracing wants to
		 * use this to distinguish a fully live task from one that
		 * hasn't gotten to tracehook_report_clone() yet.  Now we
		 * clear it and set the child going.
		 */
		p->flags &= ~PF_STARTING;

		if (unlikely(clone_flags & CLONE_STOPPED)) {
			/*
			 * We'll start up with an immediate SIGSTOP.
			 */
			sigaddset(&p->pending.signal, SIGSTOP);
			set_tsk_thread_flag(p, TIF_SIGPENDING);
			__set_task_state(p, TASK_STOPPED);
		} else {
			wake_up_new_task(p, clone_flags);
		}

		tracehook_report_clone_complete(trace, regs,
						clone_flags, nr, p);

		/* For CLONE_VFORK, block here until the completion fires. */
		if (clone_flags & CLONE_VFORK) {
			freezer_do_not_count();
			wait_for_completion(&vfork);
			freezer_count();
			tracehook_report_vfork_done(p, nr);
		}
	} else {
		/* copy_process() failed: turn the error pointer into a code. */
		nr = PTR_ERR(p);
	}
	return nr;
}