zoukankan      html  css  js  c++  java
  • linux-2.6.18源码分析笔记---进程

    一、进程重要字段描述

    在include/linux/sched.h中定义了进程描述符task_struct,关注如下字段:

    进程状态

    volatile long state:表示进程状态,该文件头部有几种状态的取值。

    long exit_state:表示进程退出状态,下面的定义中前缀为EXIT的表示该字段取值,表示进程退出状态

    /*
     * Task state bitmask. NOTE! These bits are also
     * encoded in fs/proc/array.c: get_task_state().
     *
     * We have two separate sets of flags: task->state
     * is about runnability, while task->exit_state are
     * about the task exiting. Confusing, but this way
     * modifying one set can't modify the other one by
     * mistake.
     *
     * TASK_RUNNING is zero; every other value below is a distinct
     * power of two, so the task->state and task->exit_state values
     * never collide even though they share one numbering space.
     */
    #define TASK_RUNNING        0
    #define TASK_INTERRUPTIBLE    1
    #define TASK_UNINTERRUPTIBLE    2
    #define TASK_STOPPED        4
    #define TASK_TRACED        8
    /* in tsk->exit_state */
    #define EXIT_ZOMBIE        16
    #define EXIT_DEAD        32
    /* in tsk->state again */
    #define TASK_NONINTERACTIVE    64
    
    /*
     * Set the state of an arbitrary task.
     *
     * __set_task_state() is a plain store wrapped in do { } while (0) so it
     * behaves as a single statement.  set_task_state() performs the store
     * through set_mb() (presumably the barrier variant described in the
     * set_current_state() comment; set_mb() itself is defined elsewhere).
     *
     * The trailing backslashes are required: without them the macro body
     * is not part of the definition.
     */
    #define __set_task_state(tsk, state_value)        \
        do { (tsk)->state = (state_value); } while (0)
    #define set_task_state(tsk, state_value)        \
        set_mb((tsk)->state, (state_value))
    
    /*
     * set_current_state() includes a barrier so that the write of current->state
     * is correctly serialised wrt the caller's subsequent test of whether to
     * actually sleep:
     *
     *    set_current_state(TASK_UNINTERRUPTIBLE);
     *    if (do_i_need_to_sleep())
     *        schedule();
     *
     * If the caller does not need such serialisation then use __set_current_state()
     *
     * Both macros span two lines; the trailing backslashes are required for
     * the second line to belong to the definition.
     */
    #define __set_current_state(state_value)            \
        do { current->state = (state_value); } while (0)
    #define set_current_state(state_value)        \
        set_mb(current->state, (state_value))
    state取值和操作

    标志一个进程

    pid_t pid;
    pid_t tgid;

    用process id,即pid来标志一个进程。POSIX规定一个线程组中所有线程都必须有同一个pid(线程在linux中也被叫做轻量级进程),在linux中所有线程使用该线程组的领头线程(thread group leader)的pid,存放在tgid之中。getpid系统调用返回的是当前进程的tgid而不是pid

    线程描述符

    struct thread_info *thread_info;

    该结构和进程的内核栈存放在两个连续的页框之内,thread_info放在栈增长方向的顶部,可以根据esp指针找到thread_info(如果栈是8K,那么只需要屏蔽掉esp的低13位),同时根据thread_info中的task指针可以快速找到进程的进程描述符。

    thread_info定义在include/asm-i386/thread_info.h之中,该文件还定义了一些对于thread_info的操作函数。

    /*
     * Low-level per-task data.  `task` points back at the owning
     * task_struct.  The member order is left exactly as quoted.
     */
    struct thread_info {
        struct task_struct    *task;        /* main task structure */
        struct exec_domain    *exec_domain;    /* execution domain */
        unsigned long        flags;        /* low level flags */
        unsigned long        status;        /* thread-synchronous flags */
        __u32            cpu;        /* current CPU */
        int            preempt_count;    /* 0 => preemptable, <0 => BUG */
    
    
        mm_segment_t        addr_limit;    /* thread address space:
                                0-0xBFFFFFFF for user-thead
                               0-0xFFFFFFFF for kernel-thread
                            */
        void            *sysenter_return;
        struct restart_block    restart_block;
    
        unsigned long           previous_esp;   /* ESP of the previous stack in case
                               of nested (IRQ) stacks
                            */
        /* zero-length trailing array: names the memory that follows the struct */
        __u8            supervisor_stack[0];
    };
    thread_info

    在include/linux/sched.h中还定义了thread_union,用来方便地描述线程描述符和内核栈

    /*
     * Overlays a thread_info on an array of THREAD_SIZE bytes (expressed
     * in longs), so both views share the same storage.
     */
    union thread_union {
        struct thread_info thread_info;
        unsigned long stack[THREAD_SIZE/sizeof(long)];
    };
    thread_union

    进程链表

    struct list_head tasks; 连接所有的进程描述符

    在include/linux/list.h之中定义了linux的两种链表list_head和hlist_head、hlist_node及相关操作。在此不赘述,看task_struct中部分链表

    在include/linux/sched.h中还定义了用来遍历所有进程描述符的宏for_each_process,以及初始进程init_task

    运行进程的链表

    struct list_head run_list;

    这个字段在调度的时候使用,当寻找下一个可运行的进程的时候就使用这个字段

    进程间的关系

    父子关系涉及到如下4个字段:

    struct task_struct *real_parent;  父进程
    struct task_struct *parent;  在发出ptrace调用时和real_parent不一致
    struct list_head children;  子进程链表的头
    struct list_head sibling;  子进程链表的下一个子进程

    而进程间有非亲属关系,涉及的字段如下:

    struct task_struct *group_leader;  指向线程组领头线程的进程描述符

    struct list_head thread_group;  线程组的链表

    pid_t signal->pgrp;  所在进程组领头进程的PID

    pid_t signal->session;  会话领头进程的pid

    struct list_head ptrace_children;  跟踪子进程的链表头

    struct list_head ptrace_list;  跟踪子进程的链表节点

    进程的等待队列

    在include/linux/wait.h中定义了两个结构__wait_queue_head和__wait_queue,分别表示等待队列的头和等待队列的节点

    /* Head of a wait queue: a spinlock plus the list head of the queue. */
    struct __wait_queue_head {
        spinlock_t lock;
        struct list_head task_list;
    };
    __wait_queue_head
    /*
     * One entry on a wait queue.  `flags` may carry WQ_FLAG_EXCLUSIVE,
     * `func` is the wake-up callback, and `task_list` links the entry into
     * a queue.  `private` is an opaque pointer; its meaning is not visible
     * in this excerpt.
     */
    struct __wait_queue {
        unsigned int flags;
    #define WQ_FLAG_EXCLUSIVE    0x01
        void *private;
        wait_queue_func_t func;
        struct list_head task_list;
    };
    __wait_queue

    __wait_queue_head中包含一个自旋锁和一个链表头(task_list)

    __wait_queue中第一个字段flags表示互斥(进程等待互斥的访问同一资源)还是非互斥进程(比如等待磁盘传输结束的所有进程),第二个字段private为void指针,通常指向的应该是等待进程的进程描述符(书上这一部分写的是task_struct指针,不清楚到这一版本怎么改成了这个),func字段表示的是如何唤醒进程(类型定义也在该文件之中),task_list字段把等待相同事件的进程串联起来

    该文件中还定义了很多等待队列的操作,具体看文件,不再赘述

    进程资源限制

    在进程描述符中有一个信号描述符

    struct signal_struct *signal;

    该结构同样定义在本文件中,在signal_struct中有一个如下字段,在信号描述符中似乎还定义了很多和信号无关的数据,这里的资源限制,上面的一些pid值,我还不是很清楚为什么要这么做,是因为锁的缘故吗?信号描述符好像是没有锁的,而锁(siglock)在sighand之中?

    struct rlimit rlim[RLIM_NLIMITS];

    struct rlimit定义在include/linux/resource.h之中,两个字段分别表示资源的当前限制值和最大限制值

    /*
     * One resource limit: a current value and a maximum value.
     * copy_process() compares rlim[RLIMIT_NPROC].rlim_cur against the
     * user's process count.
     */
    struct rlimit {
        unsigned long    rlim_cur;
        unsigned long    rlim_max;
    };
    rlimit

    在include/asm-generic/resource.h之中定义了很多以RLIMIT开头的宏,用来访问各种资源的限制

    进程所属用户信息

    在include/linux/sched.h之中,定义了一个user_struct的结构。一个用户不止拥有一个进程,多个进程间可以通过user_struct结构来共享用户的信息

    /*
     * Per-user accounting record, reference counted through __count.
     * copy_process() bumps both __count and processes for every new task.
     */
    struct user_struct {
        atomic_t __count;    /* reference count */
        atomic_t processes;    /* How many processes does this user have? */
        atomic_t files;        /* How many open files does this user have? */
        atomic_t sigpending;    /* How many pending signals does this user have? */
    #ifdef CONFIG_INOTIFY_USER
        atomic_t inotify_watches; /* How many inotify watches does this user have? */
        atomic_t inotify_devs;    /* How many inotify devs does this user have opened? */
    #endif
        /* protected by mq_lock    */
        unsigned long mq_bytes;    /* How many bytes can be allocated to mqueue? */
        unsigned long locked_shm; /* How many pages of mlocked shm ? */
    
    #ifdef CONFIG_KEYS
        struct key *uid_keyring;    /* UID specific keyring */
        struct key *session_keyring;    /* UID's default session keyring */
    #endif
    
        /* Hash table maintenance information */
        struct list_head uidhash_list;
        uid_t uid;
    };
    user_struct

    进程切换时硬件上下文的保存

    linux没有像intel所设计的那样为每个进程设置一个tss段,而是为每个cpu保留一个tss段,当发生异常和中断的时候,会根据tss中字段做一些操作。操作系统将硬件上下文保存在进程描述符的一个字段中,当进程被切换上cpu时,使用下面字段的部分值去更改tss段。

    struct thread_struct thread;

    thread_struct定义在include/asm-i386/processor.h之中

    /*
     * Per-task saved CPU state, embedded in task_struct as `thread`.
     * copy_thread() fills in esp, esp0, eip, fs, gs and the I/O bitmap
     * fields for a newly forked child.
     */
    struct thread_struct {
    /* cached TLS descriptors. */
        struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
        unsigned long    esp0;
        unsigned long    sysenter_cs;
        unsigned long    eip;
        unsigned long    esp;
        unsigned long    fs;
        unsigned long    gs;
    /* Hardware debugging registers */
        unsigned long    debugreg[8];  /* %%db0-7 debug registers */
    /* fault info */
        unsigned long    cr2, trap_no, error_code;
    /* floating point info */
        union i387_union    i387;
    /* virtual 86 mode info */
        struct vm86_struct __user * vm86_info;
        unsigned long        screen_bitmap;
        unsigned long        v86flags, v86mask, saved_esp0;
        unsigned int        saved_fs, saved_gs;
    /* IO permissions */
        unsigned long    *io_bitmap_ptr;
         unsigned long    iopl;
    /* max allowed port in the bitmap, in bytes: */
        unsigned long    io_bitmap_max;
    };
    thread_struct

    二、重要流程描述

    2.1 fork、vfork、clone系统调用,创建一个子进程

    fork、vfork、clone调用的系统服务例程在arch/i386/kernel/process.c之中,该文件还包括进程的其他系统调用的服务例程如exec等。

    /*
     * fork(): the only flag bits passed to do_fork() are the SIGCHLD
     * signal number; the child starts on the caller's stack pointer
     * (regs.esp) and no tid pointers are supplied.
     */
    asmlinkage int sys_fork(struct pt_regs regs)
    {
        return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
    }
    sys_fork
    /*
     * clone(): every do_fork() argument comes from the caller's registers:
     *   ebx - clone flags
     *   ecx - new stack pointer (0 means "keep using the caller's esp")
     *   edx - parent_tidptr
     *   edi - child_tidptr
     */
    asmlinkage int sys_clone(struct pt_regs regs)
    {
        unsigned long clone_flags;
        unsigned long newsp;
        int __user *parent_tidptr, *child_tidptr;
    
        clone_flags = regs.ebx;
        newsp = regs.ecx;
        parent_tidptr = (int __user *)regs.edx;
        child_tidptr = (int __user *)regs.edi;
        /* no stack supplied: fall back to the caller's stack pointer */
        if (!newsp)
            newsp = regs.esp;
        return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
    }
    sys_clone
    /*
     * vfork(): like sys_fork() but with CLONE_VFORK and CLONE_VM added to
     * the flags.  do_fork() makes the caller wait on a completion when
     * CLONE_VFORK is set.
     */
    asmlinkage int sys_vfork(struct pt_regs regs)
    {
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
    }
    sys_vfork

    其调用的do_fork在kernel/fork.c之中。

    /*
     * Common back end of fork/vfork/clone.
     *
     * Allocates a pid, builds the child with copy_process(), then either
     * wakes it or leaves it in TASK_STOPPED (CLONE_STOPPED).  With
     * CLONE_VFORK the caller blocks on a completion before returning.
     *
     * Returns the child's pid number on success.  On failure returns
     * -EAGAIN when no pid could be allocated, otherwise the errno encoded
     * in copy_process()'s error pointer.
     */
    long do_fork(unsigned long clone_flags,
              unsigned long stack_start,
              struct pt_regs *regs,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr)
    {
        struct task_struct *p;
        int trace = 0;
        struct pid *pid = alloc_pid();
        long nr;
    
        if (!pid)
            return -EAGAIN;
        nr = pid->nr;
        /* the caller is being traced: let fork_traceflag() decide whether the child is traced too */
        if (unlikely(current->ptrace)) {
            trace = fork_traceflag (clone_flags);
            if (trace)
                clone_flags |= CLONE_PTRACE;
        }
    
        p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        if (!IS_ERR(p)) {
            /* lives on this stack frame; only used when CLONE_VFORK is set */
            struct completion vfork;
    
            if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
            }
    
            if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
                /*
                 * We'll start up with an immediate SIGSTOP.
                 */
                sigaddset(&p->pending.signal, SIGSTOP);
                set_tsk_thread_flag(p, TIF_SIGPENDING);
            }
    
            if (!(clone_flags & CLONE_STOPPED))
                wake_up_new_task(p, clone_flags);
            else
                p->state = TASK_STOPPED;
    
            /* report the new child's pid number through ptrace_message */
            if (unlikely (trace)) {
                current->ptrace_message = nr;
                ptrace_notify ((trace << 8) | SIGTRAP);
            }
    
            if (clone_flags & CLONE_VFORK) {
                /* block here until the completion installed in p->vfork_done is completed */
                wait_for_completion(&vfork);
                if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
                    current->ptrace_message = nr;
                    ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
                }
            }
        } else {
            /* copy_process() failed: give the pid back and return its errno */
            free_pid(pid);
            nr = PTR_ERR(p);
        }
        return nr;
    }
    do_fork

    do_fork有6个参数,含义如下:

    1、clone_flags:各种信息,分为两部分,最低的字节为信号类型,用来规定子进程去世时应该向父进程发出的信号,高位是在include/linux/sched.h之中定义的很多以CLONE_开头的标志,代表不同的含义(具体见文件内,这里不再描述)。在sys_fork调用do_fork时使用了信号SIGCHLD,父进程会在子进程停止、结束或是被跟踪时收到该信号。

    2、stack_start:将父进程用户态栈的地址传入

    3、regs:寄存器值

    4、stack_size:栈大小

    5、parent_tidptr:创建子进程后,将子进程的pid写到该地址,该指针指向父进程的一块地址

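    6、child_tidptr:一个用户态地址,设置了CLONE_CHILD_SETTID或CLONE_CHILD_CLEARTID时,copy_process分别把它保存到子进程的set_child_tid和clear_child_tid字段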

    接着分析do_fork所做的工作

    a、调用了alloc_pid函数分配了一个pid。(接下来需要具体分析各个pid的情况,linux内核设计与实现进程篇里有一点信息,见kernel/pid.c文件)

    b、查看进程的ptrace字段,如果置位表示有进程想跟踪子进程,则置位clone_flags的CLONE_PTRACE位。(查看fork_traceflag函数可知跟踪子进程对vfork、fork、clone都做了区分,还涉及到SIGCHLD的判断)

    c、调用了copy_process函数复制进程,返回进程的进程描述符。copy_process是一个比较复杂的函数,下面单独看看。

    d、接着使用IS_ERR宏判断copy_process返回的指针是否是错误指针(是否位于最后一个页),该宏定义在include/linux/err.h中,包括下面用到的PTR_ERR宏(将指针转成错误号),用来处理指针错误,可直接搜索这个字段。

    e、如果必须要跟踪子进程,即设置了PT_PTRACED,或者设置了CLONE_STOPPED,则给子进程增加一个SIGSTOP信号,并设置信号标志位

    f、如果没有设置CLONE_STOPPED,则调用wake_up_new_task直接唤醒新的进程,函数实现在kernel/sched.c之中。

    g、如果设置了CLONE_STOPPED,则进程状态设置为TASK_STOPPED

    h、如果设置了trace,则把子进程的pid放入父进程的ptrace_message,并调用ptrace_notify,这是信号部分的函数,在kernel/signal.c中,它使当前进程停止运行,向当前进程的父进程,也就是debugger进程发送SIGTRAP信号,并且可以在ptrace_message中找到子进程的pid。

    i、如果设置了CLONE_VFORK,则让父进程在wait_for_completion(&vfork)处挂起,直到子进程通过vfork_done完成这个completion(应该是子进程结束或执行新程序的时候)父进程才会继续往下执行;被唤醒之后如果设置了PT_TRACE_VFORK_DONE,下面的内容还是执行ptrace_notify函数。

    2.2 再看fork、vfork、clone系统调用的区别

    clone系统调用是功能最为强大的一个,他调用的do_fork函数所有参数都来自于调用者传入,可以实现任何程度上的进程复制。

    fork系统调用无参数,它的clone_flags中没有设置任何CLONE_标志,只规定了子进程退出时向父进程发送SIGCHLD信号。同时还有写时复制的原因在里面。

    vfork系统调用则在fork系统调用的基础上加了CLONE_VFORK和CLONE_VM位,CLONE_VM位表示共享内存描述符和所有页表,即共享了所有的地址空间,同时CLONE_VFORK表示子进程运行时让父进程挂起,直到子进程结束。在网上看到vfork系统调用出来的子进程不能使用return,也不能使用exit,但能使用_exit,不知道为什么,还需要思考一下。是因为子进程使用return和exit之后会释放掉父进程的栈空间,导致父进程不能继续执行了吗?也就是说子进程和父进程的执行步骤还是不一样的,到底有哪些事件一样,哪些事件不一样还需要好好看一看exit和_exit的区别。

    2.3 copy_process函数复制进程

    /*
     * This creates a new process as a copy of the old one,
     * but does not actually start it yet.
     *
     * It copies the registers, and all the appropriate
     * parts of the process environment (as per the clone
     * flags). The actual kick-off is left to the caller.
     *
     * Returns the new task_struct on success, or an ERR_PTR()-encoded
     * negative errno on failure.  `pid` is the pid number the caller has
     * already allocated for the child.
     */
    static struct task_struct *copy_process(unsigned long clone_flags,
                        unsigned long stack_start,
                        struct pt_regs *regs,
                        unsigned long stack_size,
                        int __user *parent_tidptr,
                        int __user *child_tidptr,
                        int pid)
    {
        int retval;
        struct task_struct *p = NULL;
    
        /* CLONE_NEWNS and CLONE_FS are mutually exclusive */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
            return ERR_PTR(-EINVAL);
    
        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
            return ERR_PTR(-EINVAL);
    
        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
            return ERR_PTR(-EINVAL);
    
        retval = security_task_create(clone_flags);
        if (retval)
            goto fork_out;
    
        retval = -ENOMEM;
        p = dup_task_struct(current);
        if (!p)
            goto fork_out;
    
    #ifdef CONFIG_TRACE_IRQFLAGS
        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
    #endif
        /*
         * Fail with -EAGAIN when the user already has rlim_cur or more
         * processes, unless the caller has CAP_SYS_ADMIN or
         * CAP_SYS_RESOURCE, or the user is root_user.
         */
        retval = -EAGAIN;
        if (atomic_read(&p->user->processes) >=
                p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
            if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                    p->user != &root_user)
                goto bad_fork_free;
        }
    
        /* references taken for the child; dropped again at bad_fork_cleanup_count */
        atomic_inc(&p->user->__count);
        atomic_inc(&p->user->processes);
        get_group_info(p->group_info);
    
        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        if (nr_threads >= max_threads)
            goto bad_fork_cleanup_count;
    
        /* hold the exec-domain and binfmt modules for the child's lifetime */
        if (!try_module_get(task_thread_info(p)->exec_domain->module))
            goto bad_fork_cleanup_count;
    
        if (p->binfmt && !try_module_get(p->binfmt->module))
            goto bad_fork_cleanup_put_domain;
    
        p->did_exec = 0;
        delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
        copy_flags(clone_flags, p);
        p->pid = pid;
        /* CLONE_PARENT_SETTID: write the child's pid to the user pointer */
        retval = -EFAULT;
        if (clone_flags & CLONE_PARENT_SETTID)
            if (put_user(p->pid, parent_tidptr))
                goto bad_fork_cleanup_delays_binfmt;
    
        /* p is a byte copy of current, so reset everything that must not be inherited */
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);
    
        clear_tsk_thread_flag(p, TIF_SIGPENDING);
        init_sigpending(&p->pending);
    
        p->utime = cputime_zero;
        p->stime = cputime_zero;
         p->sched_time = 0;
        p->rchar = 0;        /* I/O counter: bytes read */
        p->wchar = 0;        /* I/O counter: bytes written */
        p->syscr = 0;        /* I/O counter: read syscalls */
        p->syscw = 0;        /* I/O counter: write syscalls */
        acct_clear_integrals(p);
    
         p->it_virt_expires = cputime_zero;
        p->it_prof_expires = cputime_zero;
         p->it_sched_expires = 0;
         INIT_LIST_HEAD(&p->cpu_timers[0]);
         INIT_LIST_HEAD(&p->cpu_timers[1]);
         INIT_LIST_HEAD(&p->cpu_timers[2]);
    
        p->lock_depth = -1;        /* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
        p->security = NULL;
        p->io_context = NULL;
        p->io_wait = NULL;
        p->audit_context = NULL;
        cpuset_fork(p);
    #ifdef CONFIG_NUMA
         p->mempolicy = mpol_copy(p->mempolicy);
         if (IS_ERR(p->mempolicy)) {
             retval = PTR_ERR(p->mempolicy);
             p->mempolicy = NULL;
             goto bad_fork_cleanup_cpuset;
         }
        mpol_fix_fork_child_flag(p);
    #endif
    #ifdef CONFIG_TRACE_IRQFLAGS
        p->irq_events = 0;
        p->hardirqs_enabled = 0;
        p->hardirq_enable_ip = 0;
        p->hardirq_enable_event = 0;
        p->hardirq_disable_ip = _THIS_IP_;
        p->hardirq_disable_event = 0;
        p->softirqs_enabled = 1;
        p->softirq_enable_ip = _THIS_IP_;
        p->softirq_enable_event = 0;
        p->softirq_disable_ip = 0;
        p->softirq_disable_event = 0;
        p->hardirq_context = 0;
        p->softirq_context = 0;
    #endif
    #ifdef CONFIG_LOCKDEP
        p->lockdep_depth = 0; /* no locks held yet */
        p->curr_chain_key = 0;
        p->lockdep_recursion = 0;
    #endif
    
        rt_mutex_init_task(p);
    
    #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
    #endif
    
        /* tgid defaults to the child's own pid; with CLONE_THREAD it is current's tgid */
        p->tgid = p->pid;
        if (clone_flags & CLONE_THREAD)
            p->tgid = current->tgid;
    
        /*
         * Each step below jumps, on failure, to the label that undoes
         * everything acquired before it (see the label ladder at the end).
         */
        if ((retval = security_task_alloc(p)))
            goto bad_fork_cleanup_policy;
        if ((retval = audit_alloc(p)))
            goto bad_fork_cleanup_security;
        /* copy all the process information */
        if ((retval = copy_semundo(clone_flags, p)))
            goto bad_fork_cleanup_audit;
        if ((retval = copy_files(clone_flags, p)))
            goto bad_fork_cleanup_semundo;
        if ((retval = copy_fs(clone_flags, p)))
            goto bad_fork_cleanup_files;
        if ((retval = copy_sighand(clone_flags, p)))
            goto bad_fork_cleanup_fs;
        if ((retval = copy_signal(clone_flags, p)))
            goto bad_fork_cleanup_sighand;
        if ((retval = copy_mm(clone_flags, p)))
            goto bad_fork_cleanup_signal;
        if ((retval = copy_keys(clone_flags, p)))
            goto bad_fork_cleanup_mm;
        if ((retval = copy_namespace(clone_flags, p)))
            goto bad_fork_cleanup_keys;
        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
        if (retval)
            goto bad_fork_cleanup_namespace;
    
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
        p->robust_list = NULL;
    #ifdef CONFIG_COMPAT
        p->compat_robust_list = NULL;
    #endif
        INIT_LIST_HEAD(&p->pi_state_list);
        p->pi_state_cache = NULL;
    
        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
            p->sas_ss_sp = p->sas_ss_size = 0;
    
        /*
         * Syscall tracing should be turned off in the child regardless
         * of CLONE_PTRACE.
         */
        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
    #ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
    #endif
    
        /* Our parent execution domain becomes current domain
           These must match for thread signalling to apply */
           
        p->parent_exec_id = p->self_exec_id;
    
        /* ok, now we should be set up.. */
        /* threads get exit_signal -1; otherwise it is the CSIGNAL bits of clone_flags */
        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
        p->pdeath_signal = 0;
        p->exit_state = 0;
    
        /*
         * Ok, make it visible to the rest of the system.
         * We dont wake it up yet.
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
        INIT_LIST_HEAD(&p->ptrace_children);
        INIT_LIST_HEAD(&p->ptrace_list);
    
        /* Perform scheduler related setup. Assign this task to a CPU. */
        sched_fork(p, clone_flags);
    
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
    
        /*
         * The task hasn't been attached yet, so its cpus_allowed mask will
         * not be changed, nor will its assigned CPU.
         *
         * The cpus_allowed mask of the parent may have changed after it was
         * copied first time - so re-copy it here, then check the child's CPU
         * to ensure it is on a valid CPU (and if not, just force it back to
         * parent's CPU). This avoids alot of nasty races.
         */
        p->cpus_allowed = current->cpus_allowed;
        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
                !cpu_online(task_cpu(p))))
            set_task_cpu(p, smp_processor_id());
    
        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
            p->real_parent = current->real_parent;
        else
            p->real_parent = current;
        p->parent = p->real_parent;
    
        /* siglock nests inside tasklist_lock; both are released on every exit path below */
        spin_lock(&current->sighand->siglock);
    
        /*
         * Process group and session signals need to be delivered to just the
         * parent before the fork or both the parent and the child after the
         * fork. Restart if a signal comes in before we add the new process to
         * it's process group.
         * A fatal signal pending means that current will exit, so the new
         * thread can't slip out of an OOM kill (or normal SIGKILL).
          */
         recalc_sigpending();
        if (signal_pending(current)) {
            spin_unlock(&current->sighand->siglock);
            write_unlock_irq(&tasklist_lock);
            retval = -ERESTARTNOINTR;
            goto bad_fork_cleanup_namespace;
        }
    
        if (clone_flags & CLONE_THREAD) {
            /* join current's thread group instead of leading a new one */
            p->group_leader = current->group_leader;
            list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
    
            if (!cputime_eq(current->signal->it_virt_expires,
                    cputime_zero) ||
                !cputime_eq(current->signal->it_prof_expires,
                    cputime_zero) ||
                current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
                !list_empty(&current->signal->cpu_timers[0]) ||
                !list_empty(&current->signal->cpu_timers[1]) ||
                !list_empty(&current->signal->cpu_timers[2])) {
                /*
                 * Have child wake up on its first tick to check
                 * for process CPU timers.
                 */
                p->it_prof_expires = jiffies_to_cputime(1);
            }
        }
    
        /*
         * inherit ioprio
         */
        p->ioprio = current->ioprio;
    
        /* tasks with pid 0 are not linked into the parent/pid structures */
        if (likely(p->pid)) {
            add_parent(p);
            if (unlikely(p->ptrace & PT_PTRACED))
                __ptrace_link(p, current->parent);
    
            if (thread_group_leader(p)) {
                /* a new group leader inherits tty, process group and session from current */
                p->signal->tty = current->signal->tty;
                p->signal->pgrp = process_group(current);
                p->signal->session = current->signal->session;
                attach_pid(p, PIDTYPE_PGID, process_group(p));
                attach_pid(p, PIDTYPE_SID, p->signal->session);
    
                list_add_tail_rcu(&p->tasks, &init_task.tasks);
                __get_cpu_var(process_counts)++;
            }
            attach_pid(p, PIDTYPE_PID, p->pid);
            nr_threads++;
        }
    
        total_forks++;
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        proc_fork_connector(p);
        return p;
    
    /*
     * Error unwinding.  The labels are ordered in reverse of acquisition,
     * so a jump to any label falls through every cleanup that is needed
     * for what had been set up at that point.
     */
    bad_fork_cleanup_namespace:
        exit_namespace(p);
    bad_fork_cleanup_keys:
        exit_keys(p);
    bad_fork_cleanup_mm:
        if (p->mm)
            mmput(p->mm);
    bad_fork_cleanup_signal:
        cleanup_signal(p);
    bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
    bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
    bad_fork_cleanup_files:
        exit_files(p); /* blocking */
    bad_fork_cleanup_semundo:
        exit_sem(p);
    bad_fork_cleanup_audit:
        audit_free(p);
    bad_fork_cleanup_security:
        security_task_free(p);
    bad_fork_cleanup_policy:
    #ifdef CONFIG_NUMA
        mpol_free(p->mempolicy);
    bad_fork_cleanup_cpuset:
    #endif
        cpuset_exit(p);
    bad_fork_cleanup_delays_binfmt:
        delayacct_tsk_free(p);
        if (p->binfmt)
            module_put(p->binfmt->module);
    bad_fork_cleanup_put_domain:
        module_put(task_thread_info(p)->exec_domain->module);
    bad_fork_cleanup_count:
        put_group_info(p->group_info);
        atomic_dec(&p->user->processes);
        free_uid(p->user);
    bad_fork_free:
        free_task(p);
    fork_out:
        return ERR_PTR(retval);
    }
    copy_process

    函数执行步骤如下:

    1、首先判断clone_flags的各种位

    • CLONE_NEWNS和CLONE_FS标志不能同时被设置
    • CLONE_THREAD表示把子进程插入到父进程同一线程组中,CLONE_SIGHAND共享信号处理的表、阻塞信号的表、挂起信号的表。
    • 设置了CLONE_SIGHAND位必须设置CLONE_VM位,共享内存描述符和所有页表

    2、security_task_create是钩子函数

    3、dup_task_struct函数为子进程获取进程描述符

    /*
     * Allocate a task_struct and a thread_info for a new task and fill
     * them with a copy of `orig`.
     *
     * Returns the new task_struct, or NULL when either allocation fails
     * (a task_struct that was already allocated is freed again).
     */
    static struct task_struct *dup_task_struct(struct task_struct *orig)
    {
        struct task_struct *tsk;
        struct thread_info *ti;
    
        prepare_to_copy(orig);
    
        tsk = alloc_task_struct();
        if (!tsk)
            return NULL;
    
        ti = alloc_thread_info(tsk);
        if (!ti) {
            free_task_struct(tsk);
            return NULL;
        }
    
        /* structure copy of the whole descriptor, then repoint it at its own thread_info */
        *tsk = *orig;
        tsk->thread_info = ti;
        setup_thread_stack(tsk, orig);
    
        /* One for us, one for whoever does the "release_task()" (usually parent) */
        atomic_set(&tsk->usage,2);
        atomic_set(&tsk->fs_excl, 0);
        tsk->btrace_seq = 0;
        tsk->splice_pipe = NULL;
        return tsk;
    }
    dup_task_struct

    dup_task_struct函数执行如下步骤:

    a、prepare_to_copy定义在arch/i386/kernel/process.c之中,直接调用unlazy_fpu,在include/asm-i386/i387.h中,所做的工作是将FPU、MMX、SSE/SSE2寄存器的内容保存到父进程的thread_info之中。稍后会将他们复制到子进程之中。

    b、调用了alloc_task_struct分配了一个进程描述符,在fork.c之中

    /* Allocate one task_struct from the task_struct_cachep cache (GFP_KERNEL allocation). */
    # define alloc_task_struct()    kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
    alloc_task_struct

    c、接着是include/asm-i386/thread_info.h中的alloc_thread_info,分配一块大小为THREAD_SIZE的内存,用来存放新的thread_info(和内核栈)。

    /*
     * Allocate THREAD_SIZE bytes for a task's thread_info and zero them.
     * Evaluates to the new pointer, or NULL when kmalloc() fails.
     * `tsk` is not used by this variant.
     *
     * Every line of the macro body must end in a backslash.
     *
     * NOTE: in the header this definition is one arm of a preprocessor
     * conditional.  The dangling "#else" that followed the excerpt is
     * dropped here because the other arm is not quoted.
     */
    #define alloc_thread_info(tsk)                    \
        ({                            \
            struct thread_info *ret;            \
                                    \
            ret = kmalloc(THREAD_SIZE, GFP_KERNEL);        \
            if (ret)                    \
                memset(ret, 0, THREAD_SIZE);        \
            ret;                        \
        })
    alloc_thread_info

    c、接着是复制了整个进程描述符的内容。

    d、调用setup_thread_stack函数,在include/linux/sched.h之中,注意这个函数的参数,第一个是新建的进程描述符指针tsk,但是此时还是指向了原进程描述符的内容,只是tsk的thread_info结构重新建了一个。第二个参数是当前调用进程的进程描述符指针。

    /*
     * Copy org's thread_info into p's thread_info, then point the copy's
     * back-pointer (task) at p instead of org.
     */
    static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
    {
        *task_thread_info(p) = *task_thread_info(org);
        task_thread_info(p)->task = p;
    }
    setup_thread_stack

    可以看到其实这里的thread_info结构其实也只是做了简单的数据内容复制。然后将thread_info中的task指针指向了新建的task_struct结构。

    e、usage表示该进程描述符的使用次数,设为2,而且进程处于活动状态

    进程描述符的创建就结束了,下面回到copy_process函数中去。

    4、使用p->user->processes和p->signal->rlim[RLIMIT_NPROC].rlim_cur进行比较。signal->rlim数组已经描述过,表示资源限制,RLIMIT_NPROC表示最大进程数。user是定义在进程描述符同一文件中的user_struct结构,表示一个用户的多进程间共享的用户信息。这里比较用户当前拥有进程数是否大于最大进程数。至于这个capability又是另外一块了

    5、递增引用数和拥有进程数

    6、get_group_info就是递增了一下group_info的引用,应该是进程组共享的信息,在include/linux/sched.h之中

    7、拿nr_threads和max_threads来比较,看系统进程数目是否大于最大值,原则是所有进程的内核栈空间不能超过物理内存的1/8。

    8、exec_domain是执行域,和模块等相关概念可以深入理解一下。

    9、did_exec记录的是发出exec的次数,置为0。然后调用copy_flags更改了一些标志位。

    /*
     * Derive the child's p->flags from the inherited value: clear
     * PF_SUPERPRIV and PF_NOFREEZE, set PF_FORKNOEXEC.  The child's ptrace
     * field is reset to 0 unless CLONE_PTRACE was requested.
     */
    static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
    {
        unsigned long new_flags = p->flags;
    
        new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
            p->ptrace = 0;
        p->flags = new_flags;
    }
    copy_flags

    相关标志位在include/linux/sched.h中,PF_SUPERPRIV表示使用了超级用户权限,PF_NOFREEZE表示进程不能被冻结,PF_FORKNOEXEC表示没有发出过exec系统调用。CLONE_PTRACE表示跟踪新建的子进程,并且进程描述符中的ptrace字段表示进程被跟踪。

    10、设置pid,设置了CLONE_PARENT_SETTID位的话,将子进程的pid写入到指针处(该地址是clone传入的参数,由do_fork传给copy_process,是父进程用户空间的一块地址)

    11、vfork_done是在vfork的时候使用的一个指针,在前面的do_fork有涉及到,是有关调度的。

    12、初始化进程描述符中的children和sibling指针,使用INIT_LIST_HEAD函数

    /* Initialise a list head to the empty state: both links point back at the head itself. */
    static inline void INIT_LIST_HEAD(struct list_head *list)
    {
        list->next = list;
        list->prev = list;
    }
    INIT_LIST_HEAD

    13、初始化alloc_lock自旋锁、取消信号标志位、将私有信号队列初始化、初始化各种时间变量

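    14、初始化lock_depth、start_time、security、io_context等字段,并在打开相应配置选项时初始化NUMA内存策略以及irq跟踪、lockdep、mutex调试相关的字段,然后调用rt_mutex_init_task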

    15、进程的线程组号设为进程pid号,这说明新建的是进程而不是线程;如果是新建线程,则设置了CLONE_THREAD位,tgid的值就设为当前进程的tgid。设置了CLONE_THREAD位,就应该设置信号共享位,同时应该设置CLONE_VM位,共享内存描述符和所有页表。

    16、接着是调用了很多的copy函数,就是创建一下新的结构。

    17、看copy_thread函数,用clone传入的寄存器值和clone_flags去初始化子进程的内核栈。

    /*
     * Set up the child's saved register frame and thread_struct.
     *
     * The parent's pt_regs are copied to the top of the child's kernel
     * stack with eax forced to 0 and esp replaced by the `esp` argument.
     * The I/O bitmap is duplicated when the parent has TIF_IO_BITMAP set,
     * and a TLS descriptor is installed when CLONE_SETTLS is given.
     * `nr` and `unused` are not referenced in the body.
     *
     * Returns 0 on success, or -ENOMEM / -EFAULT / -EINVAL.
     */
    int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
        unsigned long unused,
        struct task_struct * p, struct pt_regs * regs)
    {
        struct pt_regs * childregs;
        struct task_struct *tsk;
        int err;
    
        childregs = task_pt_regs(p);
        *childregs = *regs;
        /* eax = 0 in the child's frame; esp = the stack pointer passed in */
        childregs->eax = 0;
        childregs->esp = esp;
    
        /* saved esp points at the frame, esp0 just past it */
        p->thread.esp = (unsigned long) childregs;
        p->thread.esp0 = (unsigned long) (childregs+1);
    
        p->thread.eip = (unsigned long) ret_from_fork;
    
        savesegment(fs,p->thread.fs);
        savesegment(gs,p->thread.gs);
    
        tsk = current;
        if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
            /* give the child its own copy of the parent's I/O bitmap */
            p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
            if (!p->thread.io_bitmap_ptr) {
                p->thread.io_bitmap_max = 0;
                return -ENOMEM;
            }
            memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
                IO_BITMAP_BYTES);
            set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }
    
        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
            struct desc_struct *desc;
            struct user_desc info;
            int idx;
    
            /* the user_desc is read from the user address held in the child's esi */
            err = -EFAULT;
            if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
                goto out;
            err = -EINVAL;
            if (LDT_empty(&info))
                goto out;
    
            idx = info.entry_number;
            if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
                goto out;
    
            desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
            desc->a = LDT_entry_a(&info);
            desc->b = LDT_entry_b(&info);
        }
    
        err = 0;
     out:
        /* on failure, release the I/O bitmap pointed to by the child's io_bitmap_ptr */
        if (err && p->thread.io_bitmap_ptr) {
            kfree(p->thread.io_bitmap_ptr);
            p->thread.io_bitmap_max = 0;
        }
        return err;
    }
    copy_thread

    a、调用了task_pt_regs宏,该宏返回了内核栈中保存的pt_regs地址。该宏定义在include/asm-i386/processor.h之中。

    /*
     * The below -8 is to reserve 8 bytes on top of the ring0 stack.
     * This is necessary to guarantee that the entire "struct pt_regs"
     * is accessible even if the CPU hasn't stored the SS/ESP registers
     * on the stack (an interrupt gate does not save these registers
     * when switching to the same privilege ring).
     * Therefore beware: accessing the xss/esp fields of the
     * "struct pt_regs" is possible, but they may contain
     * completely wrong values.
     *
     * Evaluates to a pointer to the struct pt_regs whose end lies 8 bytes
     * below KSTK_TOP(task_stack_page(task)).
     *
     * Every line of the macro body except the last must end in a backslash;
     * without the continuations the statement expression is not part of
     * the macro at all.
     */
    #define task_pt_regs(task)                                             \
    ({                                                                     \
           struct pt_regs *__regs__;                                       \
           __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
           __regs__ - 1;                                                   \
    })
    
    /* Number of unsigned longs that fit in one THREAD_SIZE-byte stack area. */
    #define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))

    /*
     * Evaluates to the address (as an unsigned long) that lies THREAD_SIZE
     * bytes above `info`, i.e. one past the end of the stack area starting
     * at `info`. `info` is evaluated exactly once.
     *
     * The trailing backslashes are required: they keep the statement
     * expression inside the macro definition.
     */
    #define KSTK_TOP(info)                                                 \
    ({                                                                     \
           unsigned long *__ptr = (unsigned long *)(info);                 \
           (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
    })
    task_pt_regs

    其中KSTK_TOP就是把指针调整到栈底,这里用数组下标的方式应该是为了适配不同平台上long所占的字节数,这里THREAD_SIZE一般是两个页,有几个相关的函数和宏定义在include/linux/sched.h,不多赘述了。

    这个地方需要特别注意的是减8这个操作,上面的代码中有一段注释,说的是在同一特权级下经过中断门时CPU不会自动保存ss和esp寄存器,为了能完整地访问整个结构体,所以预留了8个字节的空位。看include/asm-i386/ptrace.h中关于pt_regs的定义,发现esp和xss正好是最后两个字段,即靠近栈底的一边。

    /*
     * Register frame kept on a task's kernel stack; task_pt_regs() yields a
     * pointer to it. Field order matters: esp and xss are the last two
     * members, which is why task_pt_regs() reserves 8 bytes above the frame.
     */
    struct pt_regs {
        long ebx;
        long ecx;
        long edx;
        long esi;       /* copy_thread() reads the TLS user_desc pointer from here */
        long edi;
        long ebp;
        long eax;       /* copy_thread() sets this to 0 in the child's frame */
        int  xds;
        int  xes;
        long orig_eax;
        long eip;
        int  xcs;
        long eflags;
        /*
         * The two fields below may hold wrong values when the CPU did not
         * store SS/ESP (see the comment above task_pt_regs()).
         */
        long esp;
        int  xss;
    };
    pt_regs

    b、复制整个寄存器的内容,但是eax的值置为0,这是为了让子进程返回的时候不返回pid的值,fork使用的时候就有这个规则。

    c、接着是更改了子进程的esp等一系列寄存器的值,其中子进程的eip被设置成了ret_from_fork函数的地址,该汇编函数定义在arch/i386/kernel/entry.S之中。

    d、下面的savesegment定义在include/asm-i386/system.h中

    /*
     * Store the current contents of segment register `seg` into `value`.
     * `seg` is stringified into the mov instruction, so it must be a bare
     * register name such as fs or gs; `value` may live in a register or in
     * memory ("=rm" constraint).
     *
     * The backslash after the macro head is required so that the asm
     * statement on the next line belongs to the macro.
     */
    #define savesegment(seg, value) \
        asm volatile("mov %%" #seg ",%0":"=rm" (value))
    savesegment

    也就是保存段寄存器到thread对应位置中去。

    e、关于io位图的后面再看

    f、下面一段是创建TLS。

    copy_thread完了,回到copy_process函数

    18、CLONE_CHILD_SETTID和前面的CLONE_PARENT_SETTID类似,如果设置了还要在子进程的用户空间的某个地址写入pid,这个地址也是和parent一致,由clone调用传入。

    19、

    20、设置信号处理栈的地方,在vfork调用的时候是同时设置了CLONE_VFORK和CLONE_VM,这里要求只使用了CLONE_VM,还不明白有哪些情况。以及为什么。

    21、TIF_SYSCALL_TRACE这个位跟踪系统调用。

    22、下面是初始化调度的部分。

    23、recalc_sigpending用来查看当前是否有信号,如果有,则此时还不能将新创建的进程加入到进程组,fork发出之前的信号不应该被传送给新的进程。

    24、后面实在看不下去了,很多字段都还不明白,先留着,后面再做打算。

    2.4 exit和exit_group系统调用退出进程

    先看exit的实现,sys_exit定义在kernel/exit.c中

    /*
     * exit system call entry point.
     *
     * Keeps only the low 8 bits of error_code, moves them into bits 8-15
     * and hands the result to do_exit().
     */
    asmlinkage long sys_exit(int error_code)
    {
        long exit_status = (error_code & 0xff) << 8;

        do_exit(exit_status);
    }
    sys_exit

    调用了do_exit函数来执行,实际上所有进程终止都是使用do_exit函数,包括exit_group的实现。

    fastcall NORET_TYPE void do_exit(long code)
    {
        struct task_struct *tsk = current;
        struct taskstats *tidstats;
        int group_dead;
        unsigned int mycpu;
    
        profile_task_exit(tsk);
    
        WARN_ON(atomic_read(&tsk->fs_excl));
    
        if (unlikely(in_interrupt()))
            panic("Aiee, killing interrupt handler!");
        if (unlikely(!tsk->pid))
            panic("Attempted to kill the idle task!");
        if (unlikely(tsk == child_reaper))
            panic("Attempted to kill init!");
    
        if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
            current->ptrace_message = code;
            ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
        }
    
        /*
         * We're taking recursive faults here in do_exit. Safest is to just
         * leave this task alone and wait for reboot.
         */
        if (unlikely(tsk->flags & PF_EXITING)) {
            printk(KERN_ALERT
                "Fixing recursive fault but reboot is needed!
    ");
            if (tsk->io_context)
                exit_io_context();
            set_current_state(TASK_UNINTERRUPTIBLE);
            schedule();
        }
    
        tsk->flags |= PF_EXITING;
    
        if (unlikely(in_atomic()))
            printk(KERN_INFO "note: %s[%d] exited with preempt_count %d
    ",
                    current->comm, current->pid,
                    preempt_count());
    
        taskstats_exit_alloc(&tidstats, &mycpu);
    
        acct_update_integrals(tsk);
        if (tsk->mm) {
            update_hiwater_rss(tsk->mm);
            update_hiwater_vm(tsk->mm);
        }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
             hrtimer_cancel(&tsk->signal->real_timer);
            exit_itimers(tsk->signal);
        }
        acct_collect(code, group_dead);
        if (unlikely(tsk->robust_list))
            exit_robust_list(tsk);
    #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
        if (unlikely(tsk->compat_robust_list))
            compat_exit_robust_list(tsk);
    #endif
        if (unlikely(tsk->audit_context))
            audit_free(tsk);
        taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
        taskstats_exit_free(tidstats);
    
        exit_mm(tsk);
    
        if (group_dead)
            acct_process();
        exit_sem(tsk);
        __exit_files(tsk);
        __exit_fs(tsk);
        exit_namespace(tsk);
        exit_thread();
        cpuset_exit(tsk);
        exit_keys(tsk);
    
        if (group_dead && tsk->signal->leader)
            disassociate_ctty(1);
    
        module_put(task_thread_info(tsk)->exec_domain->module);
        if (tsk->binfmt)
            module_put(tsk->binfmt->module);
    
        tsk->exit_code = code;
        proc_exit_connector(tsk);
        exit_notify(tsk);
    #ifdef CONFIG_NUMA
        mpol_free(tsk->mempolicy);
        tsk->mempolicy = NULL;
    #endif
        /*
         * This must happen late, after the PID is not
         * hashed anymore:
         */
        if (unlikely(!list_empty(&tsk->pi_state_list)))
            exit_pi_state_list(tsk);
        if (unlikely(current->pi_state_cache))
            kfree(current->pi_state_cache);
        /*
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held(tsk);
    
        if (tsk->io_context)
            exit_io_context();
    
        if (tsk->splice_pipe)
            __free_pipe_info(tsk->splice_pipe);
    
        /* PF_DEAD causes final put_task_struct after we schedule. */
        preempt_disable();
        BUG_ON(tsk->flags & PF_DEAD);
        tsk->flags |= PF_DEAD;
    
        schedule();
        BUG();
        /* Avoid "noreturn function does return".  */
        for (;;) ;
    }
    do_exit
  • 相关阅读:
    hdu 5646 DZY Loves Partition
    bzoj 1001 狼抓兔子 平面图最小割
    poj 1815 Friendship 最小割 拆点 输出字典序
    spoj 1693 Coconuts 最小割 二者取其一式
    hdu 5643 King's Game 约瑟夫环变形
    约瑟夫环问题
    hdu 5642 King's Order
    CodeForces 631C Report
    1039: C语言程序设计教程(第三版)课后习题9.4
    1043: C语言程序设计教程(第三版)课后习题10.1
  • 原文地址:https://www.cnblogs.com/likaiming/p/8659824.html
Copyright © 2011-2022 走看看