zoukankan      html  css  js  c++  java
  • linux-2.6.18源码分析笔记---进程

    一、进程重要字段描述

    在include/linux/sched.h中定义了进程描述符task_struct,关注如下字段:

    进程状态

    volatile long state:表示进程状态,该文件头部有几种状态的取值。

    long exit_state:表示进程退出状态,下面的定义中前缀为EXIT的表示该字段取值,表示进程退出状态

    /*
     * Task state bitmask. NOTE! These bits are also
     * encoded in fs/proc/array.c: get_task_state().
     *
     * We have two separate sets of flags: task->state
     * is about runnability, while task->exit_state are
     * about the task exiting. Confusing, but this way
     * modifying one set can't modify the other one by
     * mistake.
     *
     * Every non-zero value below is a distinct power of two, so the
     * values can be OR-ed together and tested as individual bits.
     * The EXIT_* values share the same numbering space but are stored
     * in a different field (exit_state) than the TASK_* values (state).
     */
    #define TASK_RUNNING        0
    #define TASK_INTERRUPTIBLE    1
    #define TASK_UNINTERRUPTIBLE    2
    #define TASK_STOPPED        4
    #define TASK_TRACED        8
    /* in tsk->exit_state */
    #define EXIT_ZOMBIE        16
    #define EXIT_DEAD        32
    /* in tsk->state again */
    #define TASK_NONINTERACTIVE    64
    
    /*
     * Store a new value into tsk->state.
     *
     * __set_task_state() is a plain assignment wrapped in do { } while (0)
     * so it behaves as a single statement; set_task_state() performs the
     * store through set_mb() instead of assigning directly.
     *
     * Each definition spans two lines, so the first line has to end in a
     * backslash continuation -- without it the macro expands to nothing
     * and the body is left dangling at file scope.
     */
    #define __set_task_state(tsk, state_value)        \
        do { (tsk)->state = (state_value); } while (0)
    #define set_task_state(tsk, state_value)        \
        set_mb((tsk)->state, (state_value))
    
    /*
     * set_current_state() includes a barrier so that the write of current->state
     * is correctly serialised wrt the caller's subsequent test of whether to
     * actually sleep:
     *
     *    set_current_state(TASK_UNINTERRUPTIBLE);
     *    if (do_i_need_to_sleep())
     *        schedule();
     *
     * If the caller does not need such serialisation then use __set_current_state()
     *
     * Both definitions span two lines; the trailing backslash on the first
     * line is what joins the body to the macro name.
     */
    #define __set_current_state(state_value)            \
        do { current->state = (state_value); } while (0)
    #define set_current_state(state_value)        \
        set_mb(current->state, (state_value))
    state取值和操作

    标志一个进程

    pid_t pid;
    pid_t tgid;

    用process id,即pid来标志一个进程。POSIX规定一个线程组中所有线程都必须有同一个pid(线程在linux中也被叫做轻量级进程),在linux中所有线程使用该线程组的领头线程(thread group leader)的pid,存放在tgid之中。getpid系统调用返回的是当前进程的tgid而不是pid

    线程描述符

    struct thread_info *thread_info;

    该结构和进程的内核栈存放在两个连续的页框之内,thread_info放在这块区域的低地址端(栈向下增长,thread_info位于栈增长方向的尽头),可以根据esp指针找到thread_info(如果栈是8K,那么只需要屏蔽掉esp的低13位),同时根据thread_info中的task指针可以快速找到进程的进程描述符。

    thread_info定义在include/asm-i386/thread_info.h之中,该文件还定义了一些对于thread_info的操作函数。

    /*
     * Low-level per-thread bookkeeping (i386).  The first member points
     * back to the owning task_struct; the zero-length supervisor_stack[]
     * array is the last member and therefore marks the first byte after
     * the fixed fields.
     */
    struct thread_info {
        struct task_struct    *task;        /* main task structure */
        struct exec_domain    *exec_domain;    /* execution domain */
        unsigned long        flags;        /* low level flags */
        unsigned long        status;        /* thread-synchronous flags */
        __u32            cpu;        /* current CPU */
        int            preempt_count;    /* 0 => preemptable, <0 => BUG */
    
    
        mm_segment_t        addr_limit;    /* thread address space:
                                0-0xBFFFFFFF for user-thread
                               0-0xFFFFFFFF for kernel-thread
                            */
        void            *sysenter_return;
        struct restart_block    restart_block;
    
        unsigned long           previous_esp;   /* ESP of the previous stack in case
                               of nested (IRQ) stacks
                            */
        __u8            supervisor_stack[0];    /* zero-length trailing array */
    };
    thread_info

    在include/linux/sched.h中还定义了thread_union,用来方便地描述线程描述符和内核栈

    /*
     * Overlays a thread_info with an array of longs that totals
     * THREAD_SIZE bytes, so both views start at the same address and the
     * union is at least THREAD_SIZE bytes large.
     */
    union thread_union {
        struct thread_info thread_info;
        unsigned long stack[THREAD_SIZE/sizeof(long)];
    };
    thread_union

    进程链表

    struct list_head tasks; 连接所有的进程描述符

    在include/linux/list.h之中定义了linux的两种链表list_head和hlist_head、hlist_node和相关操作。在此不赘述,看task_struct中部分链表

    在include/linux/sched.h中还定义了用来遍历所有进程描述符的宏for_each_process,以及初始进程init_task

    运行进程的链表

    struct list_head run_list;

    这个字段在调度的时候使用,当寻找下一个可运行的进程的时候就使用这个字段

    进程间的关系

    父子关系涉及到如下4个字段:

    struct task_struct *real_parent;  父进程
    struct task_struct *parent;  在发出ptrace调用时和real_parent不一致
    struct list_head children;  子进程链表的头
    struct list_head sibling;  子进程链表的下一个子进程

    而进程间有非亲属关系,涉及的字段如下:

    struct task_struct *group_leader;  指向线程组领头线程的进程描述符

    struct list_head thread_group;  线程组的链表

    pid_t signal->pgrp;  所在进程组领头进程的PID

    pid_t signal->session;  会话领头进程的pid

    struct list_head ptrace_children;  跟踪子进程的链表头

    struct list_head ptrace_list;  跟踪子进程的链表节点

    进程的等待队列

    在include/linux/wait.h中定义了两个结构__wait_queue_head和__wait_queue,分别表示等待队列的头和等待队列的节点

    /* Head of a wait queue: a spinlock plus the list the entries hang off. */
    struct __wait_queue_head {
        spinlock_t lock;
        struct list_head task_list;    /* list of __wait_queue entries -- presumably linked via their task_list */
    };
    __wait_queue_head
    /*
     * One entry on a wait queue.  WQ_FLAG_EXCLUSIVE is the only flag bit
     * defined here; "private" is an opaque pointer whose meaning is up to
     * whoever installs "func" (NOTE(review): typically the waiting task --
     * confirm against the wake-up helpers).
     */
    struct __wait_queue {
        unsigned int flags;
    #define WQ_FLAG_EXCLUSIVE    0x01
        void *private;
        wait_queue_func_t func;        /* callback of type wait_queue_func_t */
        struct list_head task_list;    /* list linkage */
    };
    __wait_queue

     __wait_queue_head中包含一个自旋锁和一个链表头(list_head)

    __wait_queue中第一个字段flags表示互斥(进程等待互斥的访问同一资源)还是非互斥进程(比如等待磁盘传输结束的所有进程),第二个字段为void指针private,通常指向等待进程自身的进程描述符(书上这一部分写的是task_struct指针,到这一版本改成了更通用的void *private),func字段表示的是如何唤醒进程(类型定义也在该文件之中),task_list字段把等待相同事件的进程串联起来

    该文件中还定义了很多等待队列的操作,具体看文件,不再赘述

    进程资源限制

    在进程描述符中有一个信号描述符

    struct signal_struct *signal;

    该结构同样定义在本文件中,在signal_struct中有一个如下字段,在信号描述符中似乎还定义了很多和信号无关的数据,这里的资源限制,上面的一些pid值,我还不是很清楚为什么要这么做,是因为锁的缘故吗?信号描述符好像是没有锁的,而锁(siglock)在sighand_struct之中?

    struct rlimit rlim[RLIM_NLIMITS];

    struct rlimit定义在include/linux/resource.h之中,两个字段分别表示资源的当前限制值和最大限制值

    /*
     * One resource limit: a current value and a maximum.
     * copy_process() compares rlim_cur of RLIMIT_NPROC against the user's
     * process count.
     */
    struct rlimit {
        unsigned long    rlim_cur;    /* limit currently in force */
        unsigned long    rlim_max;    /* presumably the ceiling for rlim_cur -- confirm */
    };
    rlimit

    在include/asm-generic/resource.h之中定义了很多以RLIMIT_开头的宏,用来访问各种资源的限制

    进程所属用户信息

    在include/linux/sched.h之中,定义了一个user_struct的结构。一个用户不止拥有一个进程,多个进程间可以通过user_struct结构来共享用户的信息

    /*
     * Per-user accounting record.  copy_process() bumps __count and
     * processes for every task it creates and drops them again on its
     * error path.
     */
    struct user_struct {
        atomic_t __count;    /* reference count */
        atomic_t processes;    /* How many processes does this user have? */
        atomic_t files;        /* How many open files does this user have? */
        atomic_t sigpending;    /* How many pending signals does this user have? */
    #ifdef CONFIG_INOTIFY_USER
        atomic_t inotify_watches; /* How many inotify watches does this user have? */
        atomic_t inotify_devs;    /* How many inotify devs does this user have opened? */
    #endif
        /* protected by mq_lock    */
        unsigned long mq_bytes;    /* How many bytes can be allocated to mqueue? */
        unsigned long locked_shm; /* How many pages of mlocked shm ? */
    
    #ifdef CONFIG_KEYS
        struct key *uid_keyring;    /* UID specific keyring */
        struct key *session_keyring;    /* UID's default session keyring */
    #endif
    
        /* Hash table maintenance information */
        struct list_head uidhash_list;
        uid_t uid;
    };
    user_struct

    进程切换时硬件上下文的保存

    不是如intel所设计的那样,为每个进程设置了tss字段,linux为一个cpu保留一个tss段,当发生异常和中断的时候,会根据tss中字段做一些操作,操作系统将硬件上下文保存在进程描述符的一个字段中,当进程被切换上cpu时,使用下面字段的部分值去更改tss段。

    struct thread_struct thread;

    thread_struct定义在include/asm-i386/processor.h之中

    /*
     * Per-task saved CPU state (i386).  copy_thread() fills in esp, esp0,
     * eip, fs, gs and, when needed, io_bitmap_ptr and a tls_array entry
     * for a new child.
     */
    struct thread_struct {
    /* cached TLS descriptors. */
        struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
        unsigned long    esp0;
        unsigned long    sysenter_cs;
        unsigned long    eip;
        unsigned long    esp;
        unsigned long    fs;
        unsigned long    gs;
    /* Hardware debugging registers */
        unsigned long    debugreg[8];  /* %%db0-7 debug registers */
    /* fault info */
        unsigned long    cr2, trap_no, error_code;
    /* floating point info */
        union i387_union    i387;
    /* virtual 86 mode info */
        struct vm86_struct __user * vm86_info;
        unsigned long        screen_bitmap;
        unsigned long        v86flags, v86mask, saved_esp0;
        unsigned int        saved_fs, saved_gs;
    /* IO permissions */
        unsigned long    *io_bitmap_ptr;
         unsigned long    iopl;
    /* max allowed port in the bitmap, in bytes: */
        unsigned long    io_bitmap_max;
    };
    thread_struct

    二、重要流程描述

    2.1 fork、vfork、clone系统调用,创建一个子进程

    fork、vfork、clone调用的系统服务例程在arch/i386/kernel/process.c之中,该文件还包括进程的其他系统调用的服务例程如exec等。

    /*
     * fork(2) entry point: create a child that reuses the caller's user
     * stack pointer and sends SIGCHLD to the parent; no CLONE_* flags and
     * no tid pointers are passed on.
     */
    asmlinkage int sys_fork(struct pt_regs regs)
    {
        unsigned long user_sp = regs.esp;

        return do_fork(SIGCHLD, user_sp, &regs, 0, NULL, NULL);
    }
    sys_fork
    asmlinkage int sys_clone(struct pt_regs regs)
    {
        unsigned long clone_flags;
        unsigned long newsp;
        int __user *parent_tidptr, *child_tidptr;
    
        clone_flags = regs.ebx;
        newsp = regs.ecx;
        parent_tidptr = (int __user *)regs.edx;
        child_tidptr = (int __user *)regs.edi;
        if (!newsp)
            newsp = regs.esp;
        return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
    }
    sys_clone
    /*
     * vfork(2) entry point: same as fork but with CLONE_VFORK and CLONE_VM
     * added to the SIGCHLD exit signal.
     */
    asmlinkage int sys_vfork(struct pt_regs regs)
    {
        unsigned long vfork_flags = CLONE_VFORK | CLONE_VM | SIGCHLD;

        return do_fork(vfork_flags, regs.esp, &regs, 0, NULL, NULL);
    }
    sys_vfork

    其调用的do_fork在kernel/fork.c之中。

    /*
     * Common back end of sys_fork(), sys_vfork() and sys_clone().
     *
     * Allocates a pid, has copy_process() build the child, then either
     * wakes the child or, with CLONE_STOPPED, leaves it in TASK_STOPPED.
     * With CLONE_VFORK the caller blocks on a completion that is stored in
     * the child's vfork_done.
     *
     * Returns the new pid number on success, -EAGAIN if no pid could be
     * allocated, or the errno encoded in copy_process()'s return pointer.
     */
    long do_fork(unsigned long clone_flags,
              unsigned long stack_start,
              struct pt_regs *regs,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr)
    {
        struct task_struct *p;
        int trace = 0;
        struct pid *pid = alloc_pid();
        long nr;
    
        if (!pid)
            return -EAGAIN;
        nr = pid->nr;
        /* Caller is being ptraced: fork_traceflag() decides whether the child is traced too. */
        if (unlikely(current->ptrace)) {
            trace = fork_traceflag (clone_flags);
            if (trace)
                clone_flags |= CLONE_PTRACE;
        }
    
        p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        if (!IS_ERR(p)) {
            /* Lives on this stack frame; only waited on in the CLONE_VFORK case below. */
            struct completion vfork;
    
            if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
            }
    
            if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
                /*
                 * We'll start up with an immediate SIGSTOP.
                 */
                sigaddset(&p->pending.signal, SIGSTOP);
                set_tsk_thread_flag(p, TIF_SIGPENDING);
            }
    
            if (!(clone_flags & CLONE_STOPPED))
                wake_up_new_task(p, clone_flags);
            else
                p->state = TASK_STOPPED;
    
            /* Report the child's pid to the tracer via ptrace_message. */
            if (unlikely (trace)) {
                current->ptrace_message = nr;
                ptrace_notify ((trace << 8) | SIGTRAP);
            }
    
            if (clone_flags & CLONE_VFORK) {
                wait_for_completion(&vfork);
                if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
                    current->ptrace_message = nr;
                    ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
                }
            }
        } else {
            /* copy_process() failed: give the pid back and return its errno. */
            free_pid(pid);
            nr = PTR_ERR(p);
        }
        return nr;
    }
    do_fork

    do_fork有6个参数,含义如下:

    1、clone_flags:各种信息,分为两部分,最低的字节为信号类型,用来规定子进程退出时应该向父进程发出的信号;高位是各种标志位,在include/linux/sched.h之中定义了很多以CLONE_开头的宏,代表不同的含义(具体见文件内,这里不再描述)。在sys_fork调用do_fork时使用了信号SIGCHLD,该信号在子进程停止、结束或被跟踪时发送给父进程。

    2、stack_start:将父进程用户态栈的地址传入

    3、regs:寄存器值

    4、stack_size:栈大小

    5、parent_tidptr:创建子进程后,将子进程的pid写到该地址,该指针指向父进程的一块地址

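    6、child_tidptr:用户空间的一个地址。从copy_process可以看到,设置了CLONE_CHILD_SETTID时它被保存到子进程的set_child_tid字段,设置了CLONE_CHILD_CLEARTID时被保存到clear_child_tid字段。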

    接着分析do_fork所做的工作

    a、调用了alloc_pid函数分配了一个pid。(接下来需要具体分析各个pid的情况,linux内核设计与实现进程篇里有一点信息,见kernel/pid.c文件)

    b、查看进程的ptrace字段,如果置位表示有进程想跟踪子进程,则置位clone_flags的CLONE_PTRACE位。(查看fork_traceflag函数可知跟踪子进程对vfork、fork、clone都做了区分,还涉及到SIGCHLD的判断)

    c、调用了copy_process函数复制进程,返回进程的进程描述符。copy_process是一个比较复杂的函数,下面单独看看。

    d、接着使用IS_ERR宏判断copy_process返回的指针是否正确(是否位于最后一个页),该宏定义在include/linux/err.h中,包括下面用到的PTR_ERR宏(将指针转成错误号),用来处理指针错误,可直接搜索这个字段。

    e、如果必须要跟踪子进程,即设置了PT_PTRACED,或者设置了CLONE_STOPPED,则给子进程增加一个SIGSTOP信号,并设置信号标志位

    f、如果没有设置CLONE_STOPPED,则调用wake_up_new_task直接唤醒新的进程,函数实现在kernel/sched.c之中。

    g、如果设置了CLONE_STOPPED,则进程状态设置为TASK_STOPPED

    h、如果设置了trace,则把子进程的pid放入父进程的ptrace_message,并调用ptrace_notify,这是信号部分的函数,在kernel/signal.c中,它使当前进程停止运行,向当前进程的父进程,也就是debugger进程发送SIGTRAP信号,并且可以在ptrace_message中找到子进程的pid。

    i、如果设置了CLONE_VFORK,则让父进程挂起,直到子进程结束。接下来这段代码应该停下来了,直到子进程执行完下面的代码才会被执行,下面的内容还是执行ptrace_notify函数。

    2.2 再看fork、vfork、clone系统调用的区别

    clone系统调用是功能最为强大的一个,他调用的do_fork函数所有参数都来自于调用者传入,可以实现任何程度上的进程复制。

    fork系统调用无参数,它的clone_flags中没有设置任何CLONE_标志,只规定了子进程退出时向父进程发送SIGCHLD信号。同时还有写时复制的原因在里面。

    vfork系统调用则在fork系统调用的基础上加了CLONE_VFORK和CLONE_VM位,CLONE_VM位表示共享内存描述符和所有页表,即共享了所有的地址空间,同时CLONE_VFORK表示子进程运行时让父进程挂起,直到子进程结束。在网上看到vfork系统调用出来的子进程不能使用return,也不能使用exit,但能使用_exit,不知道为什么,还需要思考一下。是因为子进程使用return和exit之后会释放掉父进程的栈空间,导致父进程不能继续执行了吗?也就是说子进程和父进程的执行步骤还是不一样的,到底有哪些事件一样,哪些事件不一样还需要好好看一看exit和_exit的区别。

    2.3 copy_process函数复制进程

    /*
     * This creates a new process as a copy of the old one,
     * but does not actually start it yet.
     *
     * It copies the registers, and all the appropriate
     * parts of the process environment (as per the clone
     * flags). The actual kick-off is left to the caller.
     *
     * @clone_flags:   CLONE_* bits plus the exit signal in the CSIGNAL bits
     * @stack_start, @stack_size, @regs: forwarded to copy_thread()
     * @parent_tidptr: user address that receives the child's pid when
     *                 CLONE_PARENT_SETTID is set
     * @child_tidptr:  user address recorded in set_child_tid /
     *                 clear_child_tid for CLONE_CHILD_SETTID /
     *                 CLONE_CHILD_CLEARTID
     * @pid:           pid number, already allocated by the caller
     *
     * Returns the new task_struct, or an ERR_PTR()-encoded errno.  On
     * failure the bad_fork_* labels at the bottom undo the setup steps in
     * reverse order.
     */
    static struct task_struct *copy_process(unsigned long clone_flags,
                        unsigned long stack_start,
                        struct pt_regs *regs,
                        unsigned long stack_size,
                        int __user *parent_tidptr,
                        int __user *child_tidptr,
                        int pid)
    {
        int retval;
        struct task_struct *p = NULL;
    
        /* Reject flag combinations that contradict each other. */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
            return ERR_PTR(-EINVAL);
    
        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
            return ERR_PTR(-EINVAL);
    
        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
            return ERR_PTR(-EINVAL);
    
        retval = security_task_create(clone_flags);
        if (retval)
            goto fork_out;
    
        retval = -ENOMEM;
        p = dup_task_struct(current);
        if (!p)
            goto fork_out;
    
    #ifdef CONFIG_TRACE_IRQFLAGS
        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
    #endif
        /*
         * Enforce RLIMIT_NPROC, unless the caller has CAP_SYS_ADMIN or
         * CAP_SYS_RESOURCE or the task belongs to root_user.
         */
        retval = -EAGAIN;
        if (atomic_read(&p->user->processes) >=
                p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
            if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                    p->user != &root_user)
                goto bad_fork_free;
        }
    
        /* Account the new task against its user and take a group_info reference. */
        atomic_inc(&p->user->__count);
        atomic_inc(&p->user->processes);
        get_group_info(p->group_info);
    
        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        if (nr_threads >= max_threads)
            goto bad_fork_cleanup_count;
    
        if (!try_module_get(task_thread_info(p)->exec_domain->module))
            goto bad_fork_cleanup_count;
    
        if (p->binfmt && !try_module_get(p->binfmt->module))
            goto bad_fork_cleanup_put_domain;
    
        p->did_exec = 0;
        delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
        copy_flags(clone_flags, p);
        p->pid = pid;
        retval = -EFAULT;
        if (clone_flags & CLONE_PARENT_SETTID)
            if (put_user(p->pid, parent_tidptr))
                goto bad_fork_cleanup_delays_binfmt;
    
        /*
         * Everything in *p was copied from the parent by dup_task_struct();
         * the assignments below reset the fields the child must not inherit.
         */
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);
    
        clear_tsk_thread_flag(p, TIF_SIGPENDING);
        init_sigpending(&p->pending);
    
        p->utime = cputime_zero;
        p->stime = cputime_zero;
         p->sched_time = 0;
        p->rchar = 0;        /* I/O counter: bytes read */
        p->wchar = 0;        /* I/O counter: bytes written */
        p->syscr = 0;        /* I/O counter: read syscalls */
        p->syscw = 0;        /* I/O counter: write syscalls */
        acct_clear_integrals(p);
    
         p->it_virt_expires = cputime_zero;
        p->it_prof_expires = cputime_zero;
         p->it_sched_expires = 0;
         INIT_LIST_HEAD(&p->cpu_timers[0]);
         INIT_LIST_HEAD(&p->cpu_timers[1]);
         INIT_LIST_HEAD(&p->cpu_timers[2]);
    
        p->lock_depth = -1;        /* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
        p->security = NULL;
        p->io_context = NULL;
        p->io_wait = NULL;
        p->audit_context = NULL;
        cpuset_fork(p);
    #ifdef CONFIG_NUMA
         p->mempolicy = mpol_copy(p->mempolicy);
         if (IS_ERR(p->mempolicy)) {
             retval = PTR_ERR(p->mempolicy);
             p->mempolicy = NULL;
             goto bad_fork_cleanup_cpuset;
         }
        mpol_fix_fork_child_flag(p);
    #endif
    #ifdef CONFIG_TRACE_IRQFLAGS
        p->irq_events = 0;
        p->hardirqs_enabled = 0;
        p->hardirq_enable_ip = 0;
        p->hardirq_enable_event = 0;
        p->hardirq_disable_ip = _THIS_IP_;
        p->hardirq_disable_event = 0;
        p->softirqs_enabled = 1;
        p->softirq_enable_ip = _THIS_IP_;
        p->softirq_enable_event = 0;
        p->softirq_disable_ip = 0;
        p->softirq_disable_event = 0;
        p->hardirq_context = 0;
        p->softirq_context = 0;
    #endif
    #ifdef CONFIG_LOCKDEP
        p->lockdep_depth = 0; /* no locks held yet */
        p->curr_chain_key = 0;
        p->lockdep_recursion = 0;
    #endif
    
        rt_mutex_init_task(p);
    
    #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
    #endif
    
        /* tgid defaults to the child's own pid; CLONE_THREAD inherits the caller's tgid instead. */
        p->tgid = p->pid;
        if (clone_flags & CLONE_THREAD)
            p->tgid = current->tgid;
    
        if ((retval = security_task_alloc(p)))
            goto bad_fork_cleanup_policy;
        if ((retval = audit_alloc(p)))
            goto bad_fork_cleanup_security;
        /* copy all the process information */
        if ((retval = copy_semundo(clone_flags, p)))
            goto bad_fork_cleanup_audit;
        if ((retval = copy_files(clone_flags, p)))
            goto bad_fork_cleanup_semundo;
        if ((retval = copy_fs(clone_flags, p)))
            goto bad_fork_cleanup_files;
        if ((retval = copy_sighand(clone_flags, p)))
            goto bad_fork_cleanup_fs;
        if ((retval = copy_signal(clone_flags, p)))
            goto bad_fork_cleanup_sighand;
        if ((retval = copy_mm(clone_flags, p)))
            goto bad_fork_cleanup_signal;
        if ((retval = copy_keys(clone_flags, p)))
            goto bad_fork_cleanup_mm;
        if ((retval = copy_namespace(clone_flags, p)))
            goto bad_fork_cleanup_keys;
        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
        if (retval)
            goto bad_fork_cleanup_namespace;
    
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
        p->robust_list = NULL;
    #ifdef CONFIG_COMPAT
        p->compat_robust_list = NULL;
    #endif
        INIT_LIST_HEAD(&p->pi_state_list);
        p->pi_state_cache = NULL;
    
        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
            p->sas_ss_sp = p->sas_ss_size = 0;
    
        /*
         * Syscall tracing should be turned off in the child regardless
         * of CLONE_PTRACE.
         */
        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
    #ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
    #endif
    
        /* Our parent execution domain becomes current domain
           These must match for thread signalling to apply */
           
        p->parent_exec_id = p->self_exec_id;
    
        /* ok, now we should be set up.. */
        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
        p->pdeath_signal = 0;
        p->exit_state = 0;
    
        /*
         * Ok, make it visible to the rest of the system.
         * We dont wake it up yet.
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
        INIT_LIST_HEAD(&p->ptrace_children);
        INIT_LIST_HEAD(&p->ptrace_list);
    
        /* Perform scheduler related setup. Assign this task to a CPU. */
        sched_fork(p, clone_flags);
    
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
    
        /*
         * The task hasn't been attached yet, so its cpus_allowed mask will
         * not be changed, nor will its assigned CPU.
         *
         * The cpus_allowed mask of the parent may have changed after it was
         * copied first time - so re-copy it here, then check the child's CPU
         * to ensure it is on a valid CPU (and if not, just force it back to
         * parent's CPU). This avoids alot of nasty races.
         */
        p->cpus_allowed = current->cpus_allowed;
        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
                !cpu_online(task_cpu(p))))
            set_task_cpu(p, smp_processor_id());
    
        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
            p->real_parent = current->real_parent;
        else
            p->real_parent = current;
        p->parent = p->real_parent;
    
        spin_lock(&current->sighand->siglock);
    
        /*
         * Process group and session signals need to be delivered to just the
         * parent before the fork or both the parent and the child after the
         * fork. Restart if a signal comes in before we add the new process to
         * it's process group.
         * A fatal signal pending means that current will exit, so the new
         * thread can't slip out of an OOM kill (or normal SIGKILL).
          */
         recalc_sigpending();
        if (signal_pending(current)) {
            /* Both locks are dropped here, before jumping into the unwind chain. */
            spin_unlock(&current->sighand->siglock);
            write_unlock_irq(&tasklist_lock);
            retval = -ERESTARTNOINTR;
            goto bad_fork_cleanup_namespace;
        }
    
        if (clone_flags & CLONE_THREAD) {
            p->group_leader = current->group_leader;
            list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
    
            if (!cputime_eq(current->signal->it_virt_expires,
                    cputime_zero) ||
                !cputime_eq(current->signal->it_prof_expires,
                    cputime_zero) ||
                current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
                !list_empty(&current->signal->cpu_timers[0]) ||
                !list_empty(&current->signal->cpu_timers[1]) ||
                !list_empty(&current->signal->cpu_timers[2])) {
                /*
                 * Have child wake up on its first tick to check
                 * for process CPU timers.
                 */
                p->it_prof_expires = jiffies_to_cputime(1);
            }
        }
    
        /*
         * inherit ioprio
         */
        p->ioprio = current->ioprio;
    
        if (likely(p->pid)) {
            add_parent(p);
            if (unlikely(p->ptrace & PT_PTRACED))
                __ptrace_link(p, current->parent);
    
            /* Only a thread group leader joins the pgrp/session hashes and the global task list. */
            if (thread_group_leader(p)) {
                p->signal->tty = current->signal->tty;
                p->signal->pgrp = process_group(current);
                p->signal->session = current->signal->session;
                attach_pid(p, PIDTYPE_PGID, process_group(p));
                attach_pid(p, PIDTYPE_SID, p->signal->session);
    
                list_add_tail_rcu(&p->tasks, &init_task.tasks);
                __get_cpu_var(process_counts)++;
            }
            attach_pid(p, PIDTYPE_PID, p->pid);
            nr_threads++;
        }
    
        total_forks++;
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        proc_fork_connector(p);
        return p;
    
        /*
         * Error unwinding: each label undoes one setup step and falls
         * through to the labels for the steps that were taken before it.
         */
    bad_fork_cleanup_namespace:
        exit_namespace(p);
    bad_fork_cleanup_keys:
        exit_keys(p);
    bad_fork_cleanup_mm:
        if (p->mm)
            mmput(p->mm);
    bad_fork_cleanup_signal:
        cleanup_signal(p);
    bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
    bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
    bad_fork_cleanup_files:
        exit_files(p); /* blocking */
    bad_fork_cleanup_semundo:
        exit_sem(p);
    bad_fork_cleanup_audit:
        audit_free(p);
    bad_fork_cleanup_security:
        security_task_free(p);
    bad_fork_cleanup_policy:
    #ifdef CONFIG_NUMA
        mpol_free(p->mempolicy);
    bad_fork_cleanup_cpuset:
    #endif
        cpuset_exit(p);
    bad_fork_cleanup_delays_binfmt:
        delayacct_tsk_free(p);
        if (p->binfmt)
            module_put(p->binfmt->module);
    bad_fork_cleanup_put_domain:
        module_put(task_thread_info(p)->exec_domain->module);
    bad_fork_cleanup_count:
        put_group_info(p->group_info);
        atomic_dec(&p->user->processes);
        free_uid(p->user);
    bad_fork_free:
        free_task(p);
    fork_out:
        return ERR_PTR(retval);
    }
    copy_process

    函数执行步骤如下:

    1、首先判断clone_flags的各种位

    • CLONE_NEWNS和CLONE_FS标志不能同时被设置
    • CLONE_THREAD表示把子进程插入到父进程同一线程组中,CLONE_SIGHAND共享信号处理的表、阻塞信号的表、挂起信号的表。
    • 设置了CLONE_SIGHAND位必须设置CLONE_VM位,共享内存描述符和所有页表

    2、security_task_create是钩子函数

    3、dup_task_struct函数为子进程获取进程描述符

    /*
     * Allocate a task_struct and a thread_info for a new task and fill
     * them with a copy of @orig's.
     *
     * Returns the new task_struct, or NULL if either allocation fails (a
     * task_struct that was already allocated is freed again first).
     */
    static struct task_struct *dup_task_struct(struct task_struct *orig)
    {
        struct task_struct *tsk;
        struct thread_info *ti;
    
        prepare_to_copy(orig);
    
        tsk = alloc_task_struct();
        if (!tsk)
            return NULL;
    
        ti = alloc_thread_info(tsk);
        if (!ti) {
            free_task_struct(tsk);
            return NULL;
        }
    
        /*
         * Whole-struct copy first; the copied thread_info pointer still
         * refers to the parent's, so it is replaced right afterwards.
         */
        *tsk = *orig;
        tsk->thread_info = ti;
        setup_thread_stack(tsk, orig);
    
        /* One for us, one for whoever does the "release_task()" (usually parent) */
        atomic_set(&tsk->usage,2);
        atomic_set(&tsk->fs_excl, 0);
        tsk->btrace_seq = 0;
        tsk->splice_pipe = NULL;
        return tsk;
    }
    dup_task_struct

    dup_task_struct函数执行如下步骤:

    a、prepare_to_copy定义在arch/i386/kernel/process.c之中,直接调用unlazy_fpu,在include/asm-i386/i387.h中,所做的工作是将FPU、MMX、SSE/SSE2寄存器的内容保存到父进程的thread_info之中。稍后会将它们复制到子进程之中。

    b、调用了alloc_task_struct分配了一个新的进程描述符(此时还没有复制内容),在kernel/fork.c之中

    /* Allocate one task_struct from the task_struct_cachep slab cache with GFP_KERNEL. */
    # define alloc_task_struct()    kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
    alloc_task_struct

    c、接着是include/asm-i386/thread_info.h中的alloc_thread_info,分配一块THREAD_SIZE大小的内存,用来存放新进程的thread_info和内核栈。

    /*
     * Allocate THREAD_SIZE bytes with kmalloc() and zero them on success.
     * The statement expression evaluates to the (possibly NULL) pointer.
     * @tsk is not used by this variant.
     *
     * Every line of the definition except the last needs a trailing
     * backslash; without them only the empty first line is the macro.
     *
     * NOTE(review): the excerpt ended with a dangling "#else", presumably
     * the start of an alternative definition selected by a config option.
     * It is dropped here because an #else with no matching #if does not
     * preprocess.
     */
    #define alloc_thread_info(tsk)                    \
        ({                            \
            struct thread_info *ret;            \
                                    \
            ret = kmalloc(THREAD_SIZE, GFP_KERNEL);        \
            if (ret)                    \
                memset(ret, 0, THREAD_SIZE);        \
            ret;                        \
        })
    alloc_thread_info

    c、接着是复制了整个进程描述符的内容。

    d、调用setup_thread_stack函数,在include/linux/sched.h之中,注意这个函数的参数,第一个是新建的进程描述符指针tsk,但是此时还是指向了原进程描述符的内容,只是tsk的thread_info结构重新建了一个。第二个参数是当前调用进程的进程描述符指针。

    static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
    {
        *task_thread_info(p) = *task_thread_info(org);
        task_thread_info(p)->task = p;
    }
    setup_thread_stack

    可以看到其实这里的thread_info结构其实也只是做了简单的数据内容复制。然后将thread_info中的task指针指向了新建的task_struct结构。

    e、usage表示该进程描述符的使用次数,设为2,而且进程处于活动状态

    进程描述符的创建就结束了,下面回到copy_process函数中去。

    4、使用p->user->processes和p->signal->rlim[RLIMIT_NPROC].rlim_cur进行比较。signal->rlim数组已经描述过,表示资源限制,RLIMIT_NPROC表示最大进程数。user是定义在进程描述符同一文件中的user_struct结构,表示一个用户的多进程间共享的用户信息。这里比较用户当前拥有进程数是否大于最大进程数。至于这个capability又是另外一块了

    5、递增引用数和拥有进程数

    6、get_group_info就是递增了一下group_info的引用,应该是进程组共享的信息,在include/linux/sched.h之中

    7、拿nr_threads和max_threads来比较,看系统进程数目是否大于最大值,原则是所有进程的内核栈空间不能超过物理内存的1/8。

    8、exec_domain是执行域,和模块等相关概念可以深入理解一下。

    9、did_exec记录的是发出exec的次数,置为0。然后调用copy_flags更改了一些标志位。

    /*
     * Derive the child's flags from the value copied from the parent:
     * drop PF_SUPERPRIV and PF_NOFREEZE, set PF_FORKNOEXEC.  Unless
     * CLONE_PTRACE was requested, the child's ptrace field is cleared too.
     */
    static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
    {
        if ((clone_flags & CLONE_PTRACE) == 0)
            p->ptrace = 0;

        p->flags = (p->flags & ~(PF_SUPERPRIV | PF_NOFREEZE)) | PF_FORKNOEXEC;
    }
    copy_flags

    相关标志位在include/linux/sched.h中,PF_SUPERPRIV表示使用了超级用户权限,PF_NOFREEZE表示进程不能被冻结,PF_FORKNOEXEC表示没有发出过exec系统调用。CLONE_PTRACE表示跟踪新建的子进程,并且进程描述符中的ptrace字段表示进程被跟踪。

    10、设置pid,设置了CLONE_PARENT_SETTID位的话,将子进程的pid写入到指针处(该地址是clone传入的参数,由do_fork传给copy_process,是父进程用户空间的一块地址)

    11、vfork_done是在vfork的时候使用的一个指针,在前面的do_fork有涉及到,是有关调度的。

    12、初始化进程描述符中的children和sibling指针,使用INIT_LIST_HEAD函数

    /* Make @list an empty list: both of its links point back at itself. */
    static inline void INIT_LIST_HEAD(struct list_head *list)
    {
        list->prev = list->next = list;
    }
    INIT_LIST_HEAD

    13、初始化alloc_lock自旋锁、取消信号标志位、将私有信号队列初始化、初始化各种时间变量

    14、

    15、进程的线程组号设为进程pid号,这说明新建的是进程而不是线程;如果是新建线程,即设置了CLONE_THREAD位,tgid的值就设为父进程(current)的tgid。设置了CLONE_THREAD位,就应该设置信号共享位,同时应该设置CLONE_VM位,共享内存描述符和所有页表。

    16、接着是调用了很多的copy函数,就是创建一下新的结构。

    17、看copy_thread函数,用clone传入的寄存器值和clone_flags去初始化子进程的内核栈。

    /*
     * Set up the child's saved register frame and thread_struct (i386).
     *
     * Copies *regs into the pt_regs slot on the child's kernel stack,
     * forces eax to 0 and esp to @esp there, and points thread.eip at
     * ret_from_fork.  If the current task has TIF_IO_BITMAP set, its I/O
     * bitmap is duplicated for the child; with CLONE_SETTLS a TLS
     * descriptor is read from the user pointer held in childregs->esi.
     *
     * Returns 0 on success, or -ENOMEM, -EFAULT or -EINVAL on failure.
     * @nr and @unused are not referenced in the body.
     */
    int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
        unsigned long unused,
        struct task_struct * p, struct pt_regs * regs)
    {
        struct pt_regs * childregs;
        struct task_struct *tsk;
        int err;
    
        childregs = task_pt_regs(p);
        *childregs = *regs;
        childregs->eax = 0;    /* eax in the child's frame differs from the parent's */
        childregs->esp = esp;
    
        p->thread.esp = (unsigned long) childregs;
        p->thread.esp0 = (unsigned long) (childregs+1);
    
        p->thread.eip = (unsigned long) ret_from_fork;
    
        savesegment(fs,p->thread.fs);
        savesegment(gs,p->thread.gs);
    
        tsk = current;
        if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
            p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
            if (!p->thread.io_bitmap_ptr) {
                p->thread.io_bitmap_max = 0;
                return -ENOMEM;
            }
            memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
                IO_BITMAP_BYTES);
            set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }
    
        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
            struct desc_struct *desc;
            struct user_desc info;
            int idx;
    
            err = -EFAULT;
            if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
                goto out;
            err = -EINVAL;
            if (LDT_empty(&info))
                goto out;
    
            /* Only entries inside the TLS range of the GDT are accepted. */
            idx = info.entry_number;
            if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
                goto out;
    
            desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
            desc->a = LDT_entry_a(&info);
            desc->b = LDT_entry_b(&info);
        }
    
        err = 0;
     out:
        /* On failure, release the I/O bitmap if one is attached to the child. */
        if (err && p->thread.io_bitmap_ptr) {
            kfree(p->thread.io_bitmap_ptr);
            p->thread.io_bitmap_max = 0;
        }
        return err;
    }
    copy_thread

     a、调用了task_pt_regs宏,它返回了内核栈中保存的pt_regs地址。该宏定义在include/asm-i386/processor.h之中。

    /*
     * The below -8 is to reserve 8 bytes on top of the ring0 stack.
     * This is necessary to guarantee that the entire "struct pt_regs"
     * is accessible even if the CPU hasn't stored the SS/ESP registers
     * on the stack (an interrupt gate does not save these registers
     * when switching to the same priv ring).
     * Therefore beware: accessing the xss/esp fields of the
     * "struct pt_regs" is possible, but they may contain
     * completely wrong values.
     *
     * Evaluates to a pointer to the struct pt_regs whose end lies 8 bytes
     * below KSTK_TOP(task_stack_page(task)).
     */
    #define task_pt_regs(task)                                             \
    ({                                                                     \
           struct pt_regs *__regs__;                                       \
           __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
           __regs__ - 1;                                                   \
    })
    
    /* Number of unsigned long slots that fit in THREAD_SIZE bytes. */
    #define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
    /*
     * Address of the first byte past the THREAD_SIZE-byte region that starts
     * at `info`, returned as an unsigned long. Indexing an unsigned long array
     * keeps the arithmetic independent of sizeof(unsigned long).
     */
    #define KSTK_TOP(info)                                                 \
    ({                                                                     \
           unsigned long *__ptr = (unsigned long *)(info);                 \
           (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
    })
    task_pt_regs

    其中KSTK_TOP就是把指针调整到栈底(即整个内核栈区域的最高地址处),这里用数组下标的方式应该是为了适配long在不同平台上所占的不同字节数。这里THREAD_SIZE一般是两个页,相关的几个函数和宏定义在include/linux/sched.h,不多赘述了。

    这里需要特别注意的是减8这个操作。上面的代码中有一段注释,说的是通过中断门进入内核且特权级不变(同一ring)时,CPU不会把ss和esp压栈;为了仍然能够完整地访问整个pt_regs结构体,在ring0栈顶预留了8个字节的空位(此时读到的esp/xss可能是错误的值)。看include/asm-i386/ptrace.h中关于pt_regs的定义,发现esp和xss正好是最后两个成员,即靠近栈底的一边。

    /*
     * Saved register frame; task_pt_regs() yields a pointer to the instance
     * kept near the top of a task's kernel stack. Member order is the memory
     * layout, so esp and xss are the last two members.
     */
    struct pt_regs {
        long ebx;
        long ecx;
        long edx;
        /* copy_thread reads the user_desc pointer for CLONE_SETTLS from esi */
        long esi;
        long edi;
        long ebp;
        /* copy_thread sets this to 0 in the child's copy of the frame */
        long eax;
        int  xds;
        int  xes;
        long orig_eax;
        long eip;
        int  xcs;
        long eflags;
        /*
         * esp and xss may hold completely wrong values: per the comment on
         * task_pt_regs, an interrupt gate does not save SS/ESP when staying
         * in the same privilege ring.
         */
        long esp;
        int  xss;
    };
    pt_regs

    b、复制父进程的整个寄存器现场,但是把eax的值置为0:eax保存系统调用的返回值,这样子进程从fork返回时得到的是0而不是pid,这正是fork的返回值约定。

    c、接着是更改了子进程thread中的esp等一系列字段,其中子进程的eip被设置成了ret_from_fork函数,这个汇编函数定义在arch/i386/kernel/entry.S之中。

    d、下面的savesegment定义在include/asm-i386/system.h中

    /*
     * Copy the current value of segment register `seg` (written bare, e.g. fs
     * or gs) into the lvalue `value`; the "=rm" constraint lets the compiler
     * place `value` in either a register or memory.
     */
    #define savesegment(seg, value) \
        asm volatile("mov %%" #seg ",%0":"=rm" (value))
    savesegment

    也就是保存段寄存器到thread对应位置中去。

    e、关于io位图的后面再看

    f、下面一段是创建TLS。

    copy_thread完了,回到copy_process函数

    18、CLONE_CHILD_SETTID和前面的CLONE_PARENT_SETTID类似,如果设置了还要在子进程的用户空间的某个地址写入pid,这个地址也是和parent一致,由clone调用传入。

    19、

    20、设置信号处理栈的地方,在vfork调用的时候是同时设置了CLONE_VFORK和CLONE_VM,而这里要求只设置了CLONE_VM,还不明白有哪些情况,以及为什么。

    21、TIF_SYSCALL_TRACE这个位跟踪系统调用。

    22、下面是初始化调度的部分。

    23、recalc_sigpending用来查看当前是否有信号,如果有,则此时还不能将新创建的进程加入到进程组,fork发出之前的信号不应该被传送给新的进程。

    24、后面实在看不下去了,字段都完全不明白,先留着,后面再做打算。

    2.4 exit和exit_group系统调用退出进程

    先看exit的实现,sys_exit定义在kernel/exit.c中

    /*
     * exit() system call entry point. Only the low 8 bits of error_code are
     * kept; they are placed in bits 8-15 of the code handed to do_exit().
     */
    asmlinkage long sys_exit(int error_code)
    {
        long exit_code = (error_code & 0xff) << 8;

        do_exit(exit_code);
    }
    sys_exit

    调用了do_exit函数来执行,实际上所有进程终止都是使用do_exit函数,包括exit_group的实现。

    fastcall NORET_TYPE void do_exit(long code)
    {
        struct task_struct *tsk = current;
        struct taskstats *tidstats;
        int group_dead;
        unsigned int mycpu;
    
        profile_task_exit(tsk);
    
        WARN_ON(atomic_read(&tsk->fs_excl));
    
        if (unlikely(in_interrupt()))
            panic("Aiee, killing interrupt handler!");
        if (unlikely(!tsk->pid))
            panic("Attempted to kill the idle task!");
        if (unlikely(tsk == child_reaper))
            panic("Attempted to kill init!");
    
        if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
            current->ptrace_message = code;
            ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
        }
    
        /*
         * We're taking recursive faults here in do_exit. Safest is to just
         * leave this task alone and wait for reboot.
         */
        if (unlikely(tsk->flags & PF_EXITING)) {
            printk(KERN_ALERT
                "Fixing recursive fault but reboot is needed!
    ");
            if (tsk->io_context)
                exit_io_context();
            set_current_state(TASK_UNINTERRUPTIBLE);
            schedule();
        }
    
        tsk->flags |= PF_EXITING;
    
        if (unlikely(in_atomic()))
            printk(KERN_INFO "note: %s[%d] exited with preempt_count %d
    ",
                    current->comm, current->pid,
                    preempt_count());
    
        taskstats_exit_alloc(&tidstats, &mycpu);
    
        acct_update_integrals(tsk);
        if (tsk->mm) {
            update_hiwater_rss(tsk->mm);
            update_hiwater_vm(tsk->mm);
        }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
             hrtimer_cancel(&tsk->signal->real_timer);
            exit_itimers(tsk->signal);
        }
        acct_collect(code, group_dead);
        if (unlikely(tsk->robust_list))
            exit_robust_list(tsk);
    #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
        if (unlikely(tsk->compat_robust_list))
            compat_exit_robust_list(tsk);
    #endif
        if (unlikely(tsk->audit_context))
            audit_free(tsk);
        taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
        taskstats_exit_free(tidstats);
    
        exit_mm(tsk);
    
        if (group_dead)
            acct_process();
        exit_sem(tsk);
        __exit_files(tsk);
        __exit_fs(tsk);
        exit_namespace(tsk);
        exit_thread();
        cpuset_exit(tsk);
        exit_keys(tsk);
    
        if (group_dead && tsk->signal->leader)
            disassociate_ctty(1);
    
        module_put(task_thread_info(tsk)->exec_domain->module);
        if (tsk->binfmt)
            module_put(tsk->binfmt->module);
    
        tsk->exit_code = code;
        proc_exit_connector(tsk);
        exit_notify(tsk);
    #ifdef CONFIG_NUMA
        mpol_free(tsk->mempolicy);
        tsk->mempolicy = NULL;
    #endif
        /*
         * This must happen late, after the PID is not
         * hashed anymore:
         */
        if (unlikely(!list_empty(&tsk->pi_state_list)))
            exit_pi_state_list(tsk);
        if (unlikely(current->pi_state_cache))
            kfree(current->pi_state_cache);
        /*
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held(tsk);
    
        if (tsk->io_context)
            exit_io_context();
    
        if (tsk->splice_pipe)
            __free_pipe_info(tsk->splice_pipe);
    
        /* PF_DEAD causes final put_task_struct after we schedule. */
        preempt_disable();
        BUG_ON(tsk->flags & PF_DEAD);
        tsk->flags |= PF_DEAD;
    
        schedule();
        BUG();
        /* Avoid "noreturn function does return".  */
        for (;;) ;
    }
    do_exit
  • 相关阅读:
    【Vue】状态管理
    【Vue】路由
    【Vue】组件
    【Vue】基础(数据 & 计算属性 & 方法)
    【Vue】基础(虚拟DOM & 响应式原理)
    【Vue】基础(生命周期 & 常用指令)
    【Vue】搭建开发环境
    【Mongodb】事务
    【Mongodb】视图 && 索引
    【Mongodb】聚合查询 && 固定集合
  • 原文地址:https://www.cnblogs.com/likaiming/p/8659824.html
Copyright © 2011-2022 走看看