zoukankan      html  css  js  c++  java
  • 深入理解Linux之进程初探

    一. 关于fork调用

      fork()调用创建一个新的进程,该进程几乎是当前进程的一个完全拷贝。由fork()创建的新进程被称为子进程。fork函数被调用一次但返回两次。两次返回的唯一区别是子进程中返回0值,而父进程中返回子进程ID。子进程是父进程的副本,它将获得父进程数据空间、堆、栈等资源的副本。注意,子进程持有的是上述存储空间的“副本”,这意味着父子进程间不共享这些存储空间。从逻辑上看,Linux将父进程的地址空间内容复制给子进程,因此,子进程拥有独立的地址空间(实际实现采用写时复制(copy-on-write)技术:fork之后父子进程先共享物理页面,只有当某一方写入某个页面时才真正复制该页)。

      我们来看一个DEMO:

    // fork_example.c
    #include <memory.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>
    
    /*
     * Demonstrates that a forked child sees the same stack and heap contents
     * as its parent: both processes print the same two strings.
     *
     * Returns 1 if the heap buffer cannot be allocated, 0 otherwise (a failed
     * fork() is only reported on standard output, as before).
     */
    int main(int argc, const char *argv[])
    {
        (void)argc;
        (void)argv;

        char stack_data[] = "stack_data";

        /* sizeof of the literal covers the 9 characters plus the NUL. */
        char *heap_data = malloc(sizeof "heap_data");
        if (heap_data == NULL) {
            perror("malloc");
            return 1;
        }
        strcpy(heap_data, "heap_data");

        pid_t pid = fork();
        if (pid == 0) {
            printf("CHILD PROCESS: %s, %s\n", stack_data, heap_data);
        } else if (pid > 0) {
            printf("PARENT PROCESS: %s, %s\n", stack_data, heap_data);
        } else {
            /* Newline added so the message is a complete line, matching
             * the assembly version of this demo. */
            printf("FORK FAILED.\n");
        }

        /* Parent and child each own their copy of the buffer. */
        free(heap_data);

        return 0;
    }

      运行的输出结果为:

    CHILD PROCESS: stack_data, heap_data
    PARENT PROCESS: stack_data, heap_data

      可以看出,父进程和子进程的栈和堆的数据是相同的。这些数据在创建子进程时是通过拷贝产生的。

    二. 关于execl调用

      系统调用exec是以新的程序去代替原来进程中正在运行的程序,但进程的PID保持不变。因此,可以这样认为,exec系统调用并没有创建新的进程,只是替换了原来进程上下文的内容。原进程的代码段,数据段,堆栈段被新的程序所代替。

      我们来看一个例子:

    // execl_example.c
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    
    /*
     * Replaces the current process image with ./hello_world.
     *
     * execl() only returns when it fails, so reaching the printf below means
     * the program could not be executed; the process then exits with status 1.
     */
    int main(int argc, const char *argv[])
    {
        (void)argc;
        (void)argv;

        /*
         * The first argument after the path becomes argv[0] of the new
         * program, and the variadic list has to be terminated by a null
         * pointer of type char *; an uncast NULL is not guaranteed to be
         * passed as a pointer through "...".
         */
        execl("./hello_world", "hello_world", (char *)NULL);

        /* We can only reach this code when there is an error in execl */
        printf("The execl must be failed!\n");

        return 1;
    }

      我们执行一个不存在的hello_world程序,看看输出结果:

    The execl must be failed!

      现在我们创建一个hello_world程序,该程序简单的打印一个Hello World.

    // hello_world.c
    #include <stdio.h>
    
    /* Writes the greeting line to standard output and exits with status 0. */
    int main(int argc, const char *argv[])
    {
        fputs("Hello World!\n", stdout);
        return 0;
    }

      现在我们继续运行execl_example程序,这时输出为:

    Hello World!

      通过比较两次输出,我们发现:当execl成功时,原有的进程执行就会被打断,替换为新的进程继续执行。

    三. 使用汇编进行系统调用

      我们知道在Linux中,每个系统调用都对应一个系统调用号。这个系统调用号是在unistd.h中定义的。在我的机器上文件的位置是在:

    /usr/src/linux-headers-2.6.28-11-generic/arch/x86/include/asm/unistd_32.h

      如果找不到,可以尝试使用以下命令查找:

    locate unistd.h | xargs grep -ri "__NR_fork"

      下面是unistd.h的部分内容:

    ... ...
    #define __NR_restart_syscall      0
    #define __NR_exit          1
    #define __NR_fork          2
    #define __NR_read          3
    #define __NR_write          4
    #define __NR_open          5
    #define __NR_close          6
    #define __NR_waitpid          7
    #define __NR_creat          8
    #define __NR_link          9
    #define __NR_unlink         10
    #define __NR_execve         11
    ... ...

      使用汇编调用fork:

      可以看到fork的系统调用号是2,我们现在使用汇编代码重新编写fork_example.c

    #include <memory.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>
    
    /*
     * Same demo as fork_example.c, but the fork system call is issued by
     * hand through "int $0x80" instead of the libc fork() wrapper.
     *
     * The call number (2) and the use of eax follow the 32-bit x86
     * convention from unistd_32.h, so this is meant to be built as 32-bit
     * x86 code.
     *
     * Returns 1 if the heap buffer cannot be allocated, 0 otherwise.
     */
    int main(void)
    {
        pid_t pid;

        char stack_data[] = "stack_data";

        /* sizeof of the literal covers the 9 characters plus the NUL. */
        char *heap_data = malloc(sizeof "heap_data");
        if (heap_data == NULL) {
            perror("malloc");
            return 1;
        }
        strcpy(heap_data, "heap_data");

        // pid = fork();
        __asm__ volatile(
            "mov $0x2, %%eax\n\t" // put the fork system-call number (2) in eax
            "int $0x80\n\t"       // raise the int 0x80 software interrupt
            "mov %%eax,%0\n\t"    // store the result in pid
            : "=m" (pid)
            :
            /* eax is overwritten by the sequence above, so the compiler must
             * not keep a live value in it; "memory" makes the asm a barrier
             * for values cached across the system call. */
            : "eax", "memory"
        );

        if (pid == 0) {
            printf("CHILD PROCESS: %s, %s\n", stack_data, heap_data);
        } else if (pid > 0) {
            printf("PARENT PROCESS: %s, %s\n", stack_data, heap_data);
        } else {
            printf("FORK FAILED.\n");
        }

        /* Parent and child each own their copy of the buffer. */
        free(heap_data);

        return 0;
    }

      运行输出结果是:

    CHILD PROCESS: stack_data, heap_data
    PARENT PROCESS: stack_data, heap_data

      可以尝试将调用号替换一下,改成$0x3,得到的结果是:

    FORK FAILED.

      使用汇编调用execl:

      我们再尝试一下使用汇编调用execl。execl是C库对execve系统调用的封装,通过上面的观察我们可以看到execve的系统调用号是11.

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    
    /*
     * Issues the execve system call (number 11) by hand through "int $0x80"
     * to run ./hello_world, passing NULL for both the argument and the
     * environment vectors.
     *
     * The call number and the ebx/ecx/edx argument registers follow the
     * 32-bit x86 convention, so this is meant to be built as 32-bit x86 code.
     *
     * Control only continues past the asm statement when the call fails; the
     * process then prints a message and exits with status 1.
     */
    int main(int argc, const char *argv[])
    {
        (void)argc;
        (void)argv;

        // execl("./hello_world", NULL, NULL);
        const char *program = "./hello_world";
        __asm__ volatile (
            "mov %0,%%ebx\n\t"   // argument 1: the program path
            "mov $0,%%ecx\n\t"   // argument 2: NULL
            "mov $0,%%edx\n\t"   // argument 3: NULL
            "mov $0xb,%%eax\n\t" // put system-call number 11 in eax
            "int $0x80\n\t"      // raise the int 0x80 software interrupt
            : /* no outputs */
            /* program is only read here, so it has to be an input operand;
             * as an output the compiler would be free to skip storing it. */
            : "m" (program)
            /* every register written by the sequence above is declared */
            : "eax", "ebx", "ecx", "edx", "memory"
        );

        /* We can only reach this code when there is an error in execl */
        printf("The execl must be failed!\n");

        return 1;
    }

      运行结果为:

    Hello World!

      如果将系统调用号改为0x3,输出结果为:

    The execl must be failed!

    四.系统调用过程详解

      通过第三步的过程,我们了解到,系统调用在内核中的执行是依靠中断实现的。如果我们想进一步定位fork和execl的代码,我们需要先了解系统调用的详细过程。即回答以下两个问题:

      1.中断是怎么工作的?

      2.int 0x80中断是怎么工作的?

      中断是怎么工作的

      在Linux操作系统中,中断是通过中断描述符表工作的。中断描述符表(Interrupt Descriptor Table, IDT)是一个系统表,它与每一个中断或者异常向量相联系,每一个向量在表中有相应的中断或者异常处理程序的入口地址。内核在允许中断发生前,必须适当的初始化IDT。对于每个中断,都会有对应的中断处理程序。当产生一个中断时,Linux根据中断向量表中对应的项找到存储中断处理程序的地址,然后调用相应的中断处理程序。中断描述符表在内存中的地址存储在idtr寄存器中。内核在启动中断前,必须初始化IDT,然后将IDT的地址装载到idtr中。

      内核初始化的时候调用trap_init()函数和init_IRQ()函数初始化中断向量表。

      int 0x80中断是怎么工作的

      通过上面的分析,我们知道每个中断都有对应的处理程序。在系统调用的过程中,会有一个系统调用分派表,每个表项存储了一个系统调用。系统调用中断处理程序,根据系统调用号找到对应的系统调用执行。对于系统调用,参数的传递是通过寄存器ebx ecx edx进行传递的。eax中存储的是系统调用号。系统调用最大为__NR_syscalls个。

      

      在arch/x86/include/asm/irq_vectors.h中定义了

    # define SYSCALL_VECTOR            0x80

      现在我们查找trap_init函数,在arch/x86/kernel/traps.c中

    set_system_trap_gate(SYSCALL_VECTOR, &system_call);

      现在,查找system_call函数,在arch/x86/kernel/entry_32.S中:

    /*
     * Excerpt of the system_call entry code.  The system-call number arrives
     * in %eax: it is saved, compared against NR_syscalls, used as an index
     * into sys_call_table, and the value left in %eax by the called handler
     * is stored at PT_EAX(%esp).  The excerpt ends at the branch to
     * syscall_exit_work; the code that follows it is not shown.
     */
    ENTRY(system_call)
        RING0_INT_FRAME            # can't unwind into user space anyway
        ASM_CLAC
        pushl_cfi %eax            # save orig_eax
        SAVE_ALL
        GET_THREAD_INFO(%ebp)
                        # system call tracing in operation / emulation
        testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
        jnz syscall_trace_entry
        /* unsigned compare: numbers >= NR_syscalls go to syscall_badsys */
        cmpl $(NR_syscalls), %eax
        jae syscall_badsys
    syscall_call:
        /* indirect call through slot %eax of sys_call_table (4-byte slots) */
        call *sys_call_table(,%eax,4)
        movl %eax,PT_EAX(%esp)        # store the return value
    syscall_exit:
        LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                        # setting need_resched or sigpending
                        # between sampling and the iret
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
        /* any bit of _TIF_ALLWORK_MASK set in the flags: go to syscall_exit_work */
        testl $_TIF_ALLWORK_MASK, %ecx    # current->work
        jne syscall_exit_work

      在include/uapi/asm-generic/unistd.h中找到:

    __SYSCALL(__NR_fork, sys_fork)

      fork的系统调用号是2,对应的系统调用分派表中为sys_fork函数。在kernel/fork.c中找到如下代码:

    /*
     * The fork system call; only built when __ARCH_WANT_SYS_FORK is defined.
     * With CONFIG_MMU it forwards to do_fork() with SIGCHLD as clone_flags,
     * zero stack start/size and NULL TID pointers.  Without CONFIG_MMU it
     * returns -EINVAL.
     */
    #ifdef __ARCH_WANT_SYS_FORK
    SYSCALL_DEFINE0(fork)
    {
    #ifdef CONFIG_MMU
        return do_fork(SIGCHLD, 0, 0, NULL, NULL);
    #else
        /* can not support in nommu mode */
        return(-EINVAL);
    #endif
    }
    #endif

    四.do_fork源码分析

      现在查找do_fork函数,也在kernel/fork.c中:

    /*
     *  Ok, this is the main fork routine.
     *
     * It copies the process and, if that succeeds, starts the new task.  When
     * CLONE_VFORK is set it also waits for the new task to finish using the VM.
     *
     * clone_flags:   CLONE_* flags; the bits selected by CSIGNAL are compared
     *                with SIGCHLD when choosing the ptrace event.
     * stack_start,
     * stack_size:    passed through unchanged to copy_process().
     * parent_tidptr: user pointer that receives the new task's id when
     *                CLONE_PARENT_SETTID is set.
     * child_tidptr:  passed through unchanged to copy_process().
     *
     * Returns task_pid_vnr() of the new task on success, PTR_ERR() of the
     * copy_process() result on failure, or -EINVAL for a rejected flag
     * combination.
     */
    long do_fork(unsigned long clone_flags,
              unsigned long stack_start,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr)
    {
        struct task_struct *p;
        int trace = 0;
        long nr;
    
        /*
         * Do some preliminary argument and permissions checking before we
         * actually start allocating stuff.
         */
        if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
            if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
                return -EINVAL;
        }
    
        /*
         * Determine whether and which event to report to the ptracer.  When
         * called from a kernel thread or when CLONE_UNTRACED is set, no event
         * is reported; otherwise report the event matching the kind of fork,
         * provided that event is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
            if (clone_flags & CLONE_VFORK)
                trace = PTRACE_EVENT_VFORK;
            else if ((clone_flags & CSIGNAL) != SIGCHLD)
                trace = PTRACE_EVENT_CLONE;
            else
                trace = PTRACE_EVENT_FORK;
    
            if (likely(!ptrace_event_enabled(current, trace)))
                trace = 0;
        }
      
        // copy_process() creates the task descriptor and the other data
        // structures the child needs.
        p = copy_process(clone_flags, stack_start, stack_size,
                 child_tidptr, NULL, trace);
                 
        /* Wake up the new task, unless copy_process() returned an error. */
        if (!IS_ERR(p)) {
            struct completion vfork;
    
            trace_sched_process_fork(current, p);
    
            nr = task_pid_vnr(p);
    
            if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, parent_tidptr);
    
            /* Set up the completion before the child can start running. */
            if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
                get_task_struct(p);
            }
    
            wake_up_new_task(p);
    
            /* The fork is complete and the child is started; tell the ptracer. */
            if (unlikely(trace))
                ptrace_event(trace, nr);
    
            if (clone_flags & CLONE_VFORK) {
                if (!wait_for_vfork_done(p, &vfork))
                    ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
            }
        } else {
            nr = PTR_ERR(p);
        }
        return nr;
    }

      可以看到do_fork调用了copy_process完成了绝大部分的工作。copy_process位于同一个文件当中:

    /*
     * This creates a new process as a copy of the old one, but does not
     * actually start it yet.
     *
     * It copies the registers and all the appropriate parts of the process
     * environment.  The actual kick-off is left to the caller.
     *
     * NOTE: this is an abridged excerpt.  Most return-value checks have been
     * removed, and the error-handling labels targeted by the goto statements
     * below (bad_fork_free, bad_fork_cleanup_count, ...) are not shown.
     *
     * clone_flags:  CLONE_* flags selecting what is shared with or copied
     *               from current.
     * stack_start,
     * stack_size:   passed through to copy_thread().
     * child_tidptr: stored in set_child_tid / clear_child_tid when the
     *               matching CLONE_CHILD_* flag is set.
     * pid:          a new struct pid is allocated unless this is
     *               &init_struct_pid.
     * trace:        non-zero enables ptrace setup in ptrace_init_task(), as
     *               does CLONE_PTRACE.
     *
     * Returns the new task_struct on the path shown here.
     */
    static struct task_struct *copy_process(unsigned long clone_flags,
                        unsigned long stack_start,
                        unsigned long stack_size,
                        int __user *child_tidptr,
                        struct pid *pid,
                        int trace)
    {
        int retval;
        struct task_struct *p; // the new task descriptor
        
        /* (consistency and validity checks on the flags were removed from this excerpt) */
        
        // security_task_create() and security_task_alloc() perform all
        // additional security checks.
        retval = security_task_create(clone_flags);
        // dup_task_struct() obtains the task descriptor for the child; analysed later.
        p = dup_task_struct(current);
        // Initialises the task's ftrace_ret_stack, i.e. the stack used for
        // function-return tracing.
        ftrace_graph_init_task(p);
        get_seccomp_filter(p);
        // Initialises the mutex-related fields of the task.
        rt_mutex_init_task(p);
        // The first if limits resource usage: task_rlimit(p, RLIMIT_NPROC)
        // bounds the total number of processes the owning user may have.
        if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) {
            // The second if uses capable() to check permissions, i.e. whether
            // the caller may operate on the given resource; a return value of
            // 0 means it may not.
            if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER)
                goto bad_fork_free;
        }
        
        current->flags &= ~PF_NPROC_EXCEEDED; // clear PF_NPROC_EXCEEDED in current's flags
        copy_creds(p, clone_flags); // copies the credentials (presumably permission and identity information)
    
        // Check whether the number of threads has reached the system-wide limit.
        if (nr_threads >= max_threads)
            goto bad_fork_cleanup_count;
        
        // Take a reference on the module of the execution domain.
        if (!try_module_get(task_thread_info(p)->exec_domain->module))
            goto bad_fork_cleanup_count;
    
        p->did_exec = 0;
        delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
        copy_flags(clone_flags, p); // update the flags member of the task_struct
        INIT_LIST_HEAD(&p->children); // initialise the list of children
        INIT_LIST_HEAD(&p->sibling); // initialise the sibling list
        rcu_copy_process(p); // initialise the RCU-related fields
        p->vfork_done = NULL; 
        spin_lock_init(&p->alloc_lock); 
    
        init_sigpending(&p->pending);
    
        /* The new task starts with zeroed CPU-time accounting. */
        p->utime = p->stime = p->gtime = 0;
        p->utimescaled = p->stimescaled = 0;
        p->prev_cputime.utime = p->prev_cputime.stime = 0;
        seqlock_init(&p->vtime_seqlock);
        p->vtime_snap = 0;
        p->vtime_snap_whence = VTIME_SLEEPING;
    
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
    
        p->default_timer_slack_ns = current->timer_slack_ns;
    
        task_io_accounting_init(&p->ioac); // initialise the I/O accounting data of the task
        acct_clear_integrals(p);
    
        posix_cpu_timers_init(p); // timer initialisation
    
        do_posix_clock_monotonic_gettime(&p->start_time);
        p->real_start_time = p->start_time;
        monotonic_to_bootbased(&p->real_start_time);
        p->io_context = NULL;
        p->audit_context = NULL;
        if (clone_flags & CLONE_THREAD)
            threadgroup_change_begin(current);
        cgroup_fork(p);
    #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
            retval = PTR_ERR(p->mempolicy);
            p->mempolicy = NULL;
            goto bad_fork_cleanup_cgroup;
        }
        mpol_fix_fork_child_flag(p);
    #endif
        /* Initialise the cpuset spread rotors and the mems_allowed sequence counter. */
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
        seqcount_init(&p->mems_allowed_seq);
        /* Initialise the interrupt-state tracing fields. */ 
        p->irq_events = 0;
        p->hardirqs_enabled = 0;
        p->hardirq_enable_ip = 0;
        p->hardirq_enable_event = 0;
        p->hardirq_disable_ip = _THIS_IP_;
        p->hardirq_disable_event = 0;
        p->softirqs_enabled = 1;
        p->softirq_enable_ip = _THIS_IP_;
        p->softirq_enable_event = 0;
        p->softirq_disable_ip = 0;
        p->softirq_disable_event = 0;
        p->hardirq_context = 0;
        p->softirq_context = 0;
        /* Initialise the lock-depth tracking fields. */
        p->lockdep_depth = 0; /* no locks held yet */
        p->curr_chain_key = 0;
        p->lockdep_recursion = 0;
    
    #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
    #endif
    #ifdef CONFIG_MEMCG
        p->memcg_batch.do_batch = 0;
        p->memcg_batch.memcg = NULL;
    #endif
    
        sched_fork(p); // scheduler-related setup; per the original note it also assigns the new task to a CPU
    
        perf_event_init_task(p);
        audit_alloc(p);
            
        /*
         * Copy the following parts according to clone_flags: each one is
         * either newly allocated or shared with the parent.
         */
        copy_semundo(clone_flags, p);
        copy_files(clone_flags, p);
        copy_fs(clone_flags, p);
        copy_sighand(clone_flags, p);
        copy_signal(clone_flags, p);
        copy_mm(clone_flags, p);
        copy_namespaces(clone_flags, p);
        copy_io(clone_flags, p);
        copy_thread(clone_flags, stack_start, stack_size, p);
    
        /* Allocate a struct pid unless the caller passed &init_struct_pid. */
        if (pid != &init_struct_pid) {
            retval = -ENOMEM;
            pid = alloc_pid(p->nsproxy->pid_ns);
            if (!pid)
                goto bad_fork_cleanup_io;
        }
    
        p->pid = pid_nr(pid);
        p->tgid = p->pid;
        // A task created inside an existing thread group inherits the TGID.
        // For an ordinary process TGID and PID are equal;
        // all threads of one thread group share the same TGID,
        // which is how getpid() can return the same PID in every thread.
        if (clone_flags & CLONE_THREAD)
            p->tgid = current->tgid;
    
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
        uprobe_copy_process(p);
        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
            p->sas_ss_sp = p->sas_ss_size = 0;
    
        /*
         * Syscall tracing and stepping should be turned off in the
         * child regardless of CLONE_PTRACE.
         */
        user_disable_single_step(p);
        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
    #ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
    #endif
        clear_all_latency_tracing(p);
    
        /* ok, now we should be set up.. */
        if (clone_flags & CLONE_THREAD)
            p->exit_signal = -1;
        else if (clone_flags & CLONE_PARENT)
            p->exit_signal = current->group_leader->exit_signal;
        else
            p->exit_signal = (clone_flags & CSIGNAL);
    
        p->pdeath_signal = 0;
        p->exit_state = 0;
    
        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
        p->dirty_paused_when = 0;
    
        /*
         * Ok, make it visible to the rest of the system.
         * We dont wake it up yet.
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
    
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
    
        // With CLONE_PARENT or CLONE_THREAD the new task gets the same parent as current.
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
            p->real_parent = current->real_parent;
            p->parent_exec_id = current->parent_exec_id;
        } else { 
        // Otherwise current itself is the parent.
            p->real_parent = current;
            p->parent_exec_id = current->self_exec_id;
        }
    
        spin_lock(&current->sighand->siglock);
    
        /*
         * Process group and session signals need to be delivered to just the
         * parent before the fork or both the parent and the child after the
         * fork. Restart if a signal comes in before we add the new process to
         * it's process group.
         * A fatal signal pending means that current will exit, so the new
         * thread can't slip out of an OOM kill (or normal SIGKILL).
        */
        recalc_sigpending();
        if (signal_pending(current)) {
            spin_unlock(&current->sighand->siglock);
            write_unlock_irq(&tasklist_lock);
            retval = -ERESTARTNOINTR;
            goto bad_fork_free_pid;
        }
        
        // The new task joins the thread group of current.
        if (clone_flags & CLONE_THREAD) {
            current->signal->nr_threads++;
            atomic_inc(&current->signal->live);
            atomic_inc(&current->signal->sigcnt);
            p->group_leader = current->group_leader;
            list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
        }
    
        if (likely(p->pid)) {
            ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); // ptrace-related initialisation
            
            // p is the leader of its thread group
            if (thread_group_leader(p)) {
                if (is_child_reaper(pid)) {
                    ns_of_pid(pid)->child_reaper = p;
                    p->signal->flags |= SIGNAL_UNKILLABLE;
                }
    
                p->signal->leader_pid = pid;
                p->signal->tty = tty_kref_get(current->signal->tty);
                
                /* Attach p to the process-group and session PIDs of current. */
                attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
                attach_pid(p, PIDTYPE_SID, task_session(current));
                
                list_add_tail(&p->sibling, &p->real_parent->children);
                list_add_tail_rcu(&p->tasks, &init_task.tasks); // append to the task list headed by init_task
                __this_cpu_inc(process_counts); // increment the per-CPU process_counts
            }
            attach_pid(p, PIDTYPE_PID, pid); // attach the task to its own PID
            nr_threads++; // one more thread
        }
    
        total_forks++; // increment the global total_forks counter
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        proc_fork_connector(p);
        cgroup_post_fork(p);
        if (clone_flags & CLONE_THREAD)
            threadgroup_change_end(current);
        perf_event_fork(p);
    
        trace_task_newtask(p, clone_flags);
    
        return p;
    }

    dup_task_struct也在fork.c文件中

    /*
     * Allocates a task_struct and a thread_info for a new task on the node
     * chosen by tsk_fork_get_node(orig), fills them in from orig, and resets
     * a handful of fields for the new task.
     *
     * NOTE: this is an abridged excerpt.  The allocation results and err are
     * not checked in the code shown.
     *
     * Returns the new task_struct with its usage count set to 2.
     */
    static struct task_struct *dup_task_struct(struct task_struct *orig)
    {
        struct task_struct *tsk; // the new task_struct
        struct thread_info *ti; // the new thread information
        unsigned long *stackend; 
        int node = tsk_fork_get_node(orig); 
        int err;
    
        tsk = alloc_task_struct_node(node); // allocate the task_struct
    
        ti = alloc_thread_info_node(tsk, node); // allocate the thread_info
    
        err = arch_dup_task_struct(tsk, orig); // arch-specific copy of orig into tsk (per the original note, concerns the floating-point state)
        
        tsk->stack = ti; // the stack belonging to this task
    
        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        stackend = end_of_stack(tsk);
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
    
    #ifdef CONFIG_CC_STACKPROTECTOR
        tsk->stack_canary = get_random_int(); // set the stack canary, used to defend against stack-overflow attacks
    #endif
    
        /*
         * One for us, one for whoever does the "release_task()" (usually
         * parent)
         */
        atomic_set(&tsk->usage, 2); // set the usage count of the task
    #ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
    #endif
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
    
        account_kernel_stack(ti, 1);
    
        return tsk;
    }

      通过上面的代码,可以总结出fork的工作的基本流程是:

    五.do_execve的分析

    execve对应的内核服务例程位于fs/exec.c中。

    /*
     * Service routine behind sys_execve(): executes a program.
     *
     * filename: path of the file to execute; passed to open_exec().
     * argv:     argument strings handed to the system call.
     * envp:     environment strings handed to the system call.
     *
     * NOTE: this is an abridged excerpt.  Error checks and cleanup paths have
     * been removed, and the results of the steps after open_exec() are not
     * stored in retval, so the value returned by the code shown is
     * PTR_ERR(file).
     */
    static int do_execve_common(const char *filename,
                    struct user_arg_ptr argv,
                    struct user_arg_ptr envp)
    {
        struct linux_binprm *bprm;
        struct file *file;
        struct files_struct *displaced;
        bool clear_in_exec;
        int retval;
        const struct cred *cred = current_cred(); 
    
        unshare_files(&displaced); 
        // Allocate a zeroed linux_binprm; it is filled in below with data
        // about the new executable.
        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); 
    
        retval = prepare_bprm_creds(bprm);
    
        retval = check_unsafe_exec(bprm); 
        clear_in_exec = retval;
        current->in_execve = 1;
    
        file = open_exec(filename); // open the executable file
        retval = PTR_ERR(file);
    
        sched_exec(); // per the original note: pick the least-loaded CPU for the new program and migrate the current process to it
    
        bprm->file = file;
        bprm->filename = filename;
        bprm->interp = filename;
    
        bprm_mm_init(bprm);
    
        bprm->argc = count(argv, MAX_ARG_STRINGS);
    
        bprm->envc = count(envp, MAX_ARG_STRINGS);
        
        // prepare_binprm() fills in the linux_binprm; per the original note it:
        // a. checks that the file is executable;
        // b. initialises the e_uid and e_gid fields of bprm;
        // c. fills the buf field of bprm with the first 128 bytes of the file.
        prepare_binprm(bprm); 
        
        /*
         * Copy the file path name, the command-line arguments and the
         * environment strings into one or more newly allocated page frames.
         */
        copy_strings_kernel(1, &bprm->filename, bprm);
        bprm->exec = bprm->p;
        copy_strings(bprm->envc, envp, bprm);
        copy_strings(bprm->argc, argv, bprm);
        
        // Walk the formats list and try the load_binary method of each
        // element, passing bprm to it.  The scan stops as soon as one
        // load_binary method accepts the executable format of the file.
        search_binary_handler(bprm);
    
        /*
         * Success: release bprm and return.  (The original note says the code
         * returned is the one obtained from load_binary; see the NOTE above.)
         */
        current->fs->in_exec = 0;
        current->in_execve = 0;
        acct_update_integrals(current);
        free_bprm(bprm);
        if (displaced)
            put_files_struct(displaced);
        return retval;
    }

    下面我们看看load_elf_binary函数,该函数位于fs/binfmt_elf.c中

    static int load_elf_binary(struct linux_binprm *bprm)
    {
        struct file *interpreter = NULL; /* to shut gcc up */
         unsigned long load_addr = 0, load_bias = 0;
        int load_addr_set = 0;
        char * elf_interpreter = NULL;
        unsigned long error;
        struct elf_phdr *elf_ppnt, *elf_phdata;
        unsigned long elf_bss, elf_brk;
        int retval, i;
        unsigned int size;
        unsigned long elf_entry;
        unsigned long interp_load_addr = 0;
        unsigned long start_code, end_code, start_data, end_data;
        unsigned long reloc_func_desc __maybe_unused = 0;
        int executable_stack = EXSTACK_DEFAULT;
        unsigned long def_flags = 0;
        struct pt_regs *regs = current_pt_regs();
        struct {
            struct elfhdr elf_ex;
            struct elfhdr interp_elf_ex;
        } *loc;
    
        loc = kmalloc(sizeof(*loc), GFP_KERNEL);
        
        /* 读取可执行文件的首部。首部描述程序的段和所需的共享库。 */
        loc->elf_ex = *((struct elfhdr *)bprm->buf);
    
        /* 检测一致性 */
        if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
            goto out;
    
        if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
            goto out;
        if (!elf_check_arch(&loc->elf_ex))
            goto out;
        if (!bprm->file->f_op || !bprm->file->f_op->mmap)
            goto out;
    
        /* 读取所有的首部信息 */
        loc->elf_ex.e_phentsize != sizeof(struct elf_phdr);
        if (loc->elf_ex.e_phnum < 1 || loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
            goto out;
        size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
        retval = -ENOMEM;
        elf_phdata = kmalloc(size, GFP_KERNEL);
    
        retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, (char *)elf_phdata, size);
        if (retval != size) {
            if (retval >= 0)
                retval = -EIO;
            goto out_free_ph;
        }
    
        elf_ppnt = elf_phdata;
        elf_bss = 0;
        elf_brk = 0;
    
        start_code = ~0UL;
        end_code = 0;
        start_data = 0;
        end_data = 0;
    
        for (i = 0; i < loc->elf_ex.e_phnum; i++) {
            if (elf_ppnt->p_type == PT_INTERP) {
                /* This is the program interpreter used for
                 * shared libraries - for now assume that this
                 * is an a.out format binary
                 */
                retval = -ENOEXEC;
                if (elf_ppnt->p_filesz > PATH_MAX || 
                    elf_ppnt->p_filesz < 2)
                    goto out_free_ph;
    
                retval = -ENOMEM;
                elf_interpreter = kmalloc(elf_ppnt->p_filesz,
                              GFP_KERNEL);
                if (!elf_interpreter)
                    goto out_free_ph;
    
                retval = kernel_read(bprm->file, elf_ppnt->p_offset,
                             elf_interpreter,
                             elf_ppnt->p_filesz);
                if (retval != elf_ppnt->p_filesz) {
                    if (retval >= 0)
                        retval = -EIO;
                    goto out_free_interp;
                }
                /* make sure path is NULL terminated */
                retval = -ENOEXEC;
                if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
                    goto out_free_interp;
    
                interpreter = open_exec(elf_interpreter);
                retval = PTR_ERR(interpreter);
                if (IS_ERR(interpreter))
                    goto out_free_interp;
    
                /*
                 * If the binary is not readable then enforce
                 * mm->dumpable = 0 regardless of the interpreter's
                 * permissions.
                 */
                would_dump(bprm, interpreter);
    
                retval = kernel_read(interpreter, 0, bprm->buf,
                             BINPRM_BUF_SIZE);
                if (retval != BINPRM_BUF_SIZE) {
                    if (retval >= 0)
                        retval = -EIO;
                    goto out_free_dentry;
                }
    
                /* Get the exec headers */
                loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
                break;
            }
            elf_ppnt++;
        }
    
        elf_ppnt = elf_phdata;
        for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
            if (elf_ppnt->p_type == PT_GNU_STACK) {
                if (elf_ppnt->p_flags & PF_X)
                    executable_stack = EXSTACK_ENABLE_X;
                else
                    executable_stack = EXSTACK_DISABLE_X;
                break;
            }
    
        /* Some simple consistency checks for the interpreter */
        if (elf_interpreter) {
            retval = -ELIBBAD;
            /* Not an ELF interpreter */
            if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
                goto out_free_dentry;
            /* Verify the interpreter has a valid arch */
            if (!elf_check_arch(&loc->interp_elf_ex))
                goto out_free_dentry;
        }
    
        // 释放前一个计算所占用的几乎所有资源
        retval = flush_old_exec(bprm);
    
        /* OK, This is the point of no return */
        current->mm->def_flags = def_flags;
    
        /* Do this immediately, since STACK_TOP as used in setup_arg_pages
           may depend on the personality.  */
        SET_PERSONALITY(loc->elf_ex);
        if (elf_read_implies_exec(loc->elf_ex, executable_stack))
            current->personality |= READ_IMPLIES_EXEC;
    
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
            current->flags |= PF_RANDOMIZE;
    
        setup_new_exec(bprm);
    
        /* Do this so that we can load the interpreter, if need be.  We will
           change some of these later */
        current->mm->free_area_cache = current->mm->mmap_base;
        current->mm->cached_hole_size = 0;
        // 为进程的用户态堆栈分配一个新的线性区描述符,并把那个线性区插入到进程的地址空间。
        setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack);
        
        
        current->mm->start_stack = bprm->p;
    
        /* 现在将ELF镜像文件映射到内存中正确的位置 */
        for(i = 0, elf_ppnt = elf_phdata;
            i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
            int elf_prot = 0, elf_flags;
            unsigned long k, vaddr;
    
            if (elf_ppnt->p_type != PT_LOAD)
                continue;
    
            if (unlikely (elf_brk > elf_bss)) {
                unsigned long nbyte;
                    
                /* There was a PT_LOAD segment with p_memsz > p_filesz
                   before this one. Map anonymous pages, if needed,
                   and clear the area.  */
                retval = set_brk(elf_bss + load_bias,
                         elf_brk + load_bias);
                if (retval) {
                    send_sig(SIGKILL, current, 0);
                    goto out_free_dentry;
                }
                nbyte = ELF_PAGEOFFSET(elf_bss);
                if (nbyte) {
                    nbyte = ELF_MIN_ALIGN - nbyte;
                    if (nbyte > elf_brk - elf_bss)
                        nbyte = elf_brk - elf_bss;
                    if (clear_user((void __user *)elf_bss +
                                load_bias, nbyte)) {
                        /*
                         * This bss-zeroing can fail if the ELF
                         * file specifies odd protections. So
                         * we don't check the return value
                         */
                    }
                }
            }
    
            if (elf_ppnt->p_flags & PF_R)
                elf_prot |= PROT_READ;
            if (elf_ppnt->p_flags & PF_W)
                elf_prot |= PROT_WRITE;
            if (elf_ppnt->p_flags & PF_X)
                elf_prot |= PROT_EXEC;
    
            elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
    
            vaddr = elf_ppnt->p_vaddr;
            if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
                elf_flags |= MAP_FIXED;
            } else if (loc->elf_ex.e_type == ET_DYN) {
                /* Try and get dynamic programs out of the way of the
                 * default mmap base, as well as whatever program they
                 * might try to exec.  This is because the brk will
                 * follow the loader, and is not movable.  */
    #ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
                /* Memory randomization might have been switched off
                 * in runtime via sysctl.
                 * If that is the case, retain the original non-zero
                 * load_bias value in order to establish proper
                 * non-randomized mappings.
                 */
                if (current->flags & PF_RANDOMIZE)
                    load_bias = 0;
                else
                    load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
    #else
                load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
    #endif
            }
    
            error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
                    elf_prot, elf_flags, 0);
            if (BAD_ADDR(error)) {
                send_sig(SIGKILL, current, 0);
                retval = IS_ERR((void *)error) ?
                    PTR_ERR((void*)error) : -EINVAL;
                goto out_free_dentry;
            }
    
            if (!load_addr_set) {
                load_addr_set = 1;
                load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
                if (loc->elf_ex.e_type == ET_DYN) {
                    load_bias += error -
                                 ELF_PAGESTART(load_bias + vaddr);
                    load_addr += load_bias;
                    reloc_func_desc = load_bias;
                }
            }
            k = elf_ppnt->p_vaddr;
            if (k < start_code)
                start_code = k;
            if (start_data < k)
                start_data = k;
    
            /*
             * Check to see if the section's size will overflow the
             * allowed task size. Note that p_filesz must always be
             * <= p_memsz so it is only necessary to check p_memsz.
             */
            if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
                elf_ppnt->p_memsz > TASK_SIZE ||
                TASK_SIZE - elf_ppnt->p_memsz < k) {
                /* set_brk can never work. Avoid overflows. */
                send_sig(SIGKILL, current, 0);
                retval = -EINVAL;
                goto out_free_dentry;
            }
    
            k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
    
            if (k > elf_bss)
                elf_bss = k;
            if ((elf_ppnt->p_flags & PF_X) && end_code < k)
                end_code = k;
            if (end_data < k)
                end_data = k;
            k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
            if (k > elf_brk)
                elf_brk = k;
        }
    
        loc->elf_ex.e_entry += load_bias;
        elf_bss += load_bias;
        elf_brk += load_bias;
        start_code += load_bias;
        end_code += load_bias;
        start_data += load_bias;
        end_data += load_bias;
    
        /* Calling set_brk effectively mmaps the pages that we need
         * for the bss and break sections.  We must do this before
         * mapping in the interpreter, to make sure it doesn't wind
         * up getting placed where the bss needs to go.
         */
        retval = set_brk(elf_bss, elf_brk);
        if (retval) {
            send_sig(SIGKILL, current, 0);
            goto out_free_dentry;
        }
        if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
            send_sig(SIGSEGV, current, 0);
            retval = -EFAULT; /* Nobody gets to see this, but.. */
            goto out_free_dentry;
        }
        // 调用装入动态链接程序(解释器)的函数。如果动态链接程序是ELF格式的
        // 可执行文件,这个函数就是load_elf_interp()。
        if (elf_interpreter) {
            unsigned long interp_map_addr = 0;
            
            elf_entry = load_elf_interp(&loc->interp_elf_ex,
                            interpreter,
                            &interp_map_addr,
                            load_bias);
            if (!IS_ERR((void *)elf_entry)) {
                /*
                 * load_elf_interp() returns relocation
                 * adjustment
                 */
                interp_load_addr = elf_entry;
                elf_entry += loc->interp_elf_ex.e_entry;
            }
            if (BAD_ADDR(elf_entry)) {
                force_sig(SIGSEGV, current);
                retval = IS_ERR((void *)elf_entry) ?
                        (int)elf_entry : -EINVAL;
                goto out_free_dentry;
            }
            reloc_func_desc = interp_load_addr;
    
            allow_write_access(interpreter);
            fput(interpreter);
            kfree(elf_interpreter);
        } else {
            elf_entry = loc->elf_ex.e_entry;
            if (BAD_ADDR(elf_entry)) {
                force_sig(SIGSEGV, current);
                retval = -EINVAL;
                goto out_free_dentry;
            }
        }
    
        kfree(elf_phdata);
        // 把可执行格式的linux_binfmt对象的地址存放在进程描述符的binfmt字段中。
        set_binfmt(&elf_format);
    
    #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
        retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
        if (retval < 0) {
            send_sig(SIGKILL, current, 0);
            goto out;
        }
    #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
    
        install_exec_creds(bprm);
        retval = create_elf_tables(bprm, &loc->elf_ex,
                  load_addr, interp_load_addr);
        if (retval < 0) {
            send_sig(SIGKILL, current, 0);
            goto out;
        }
        /* N.B. passed_fileno might not be initialized? */
        current->mm->end_code = end_code;
        current->mm->start_code = start_code;
        current->mm->start_data = start_data;
        current->mm->end_data = end_data;
        current->mm->start_stack = bprm->p;
    
    #ifdef arch_randomize_brk
        if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
            current->mm->brk = current->mm->start_brk =
                arch_randomize_brk(current->mm);
    #ifdef CONFIG_COMPAT_BRK
            current->brk_randomized = 1;
    #endif
        }
    #endif
    
        if (current->personality & MMAP_PAGE_ZERO) {
            /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
               and some applications "depend" upon this behavior.
               Since we do not have the power to recompile these, we
               emulate the SVr4 behavior. Sigh. */
            error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                    MAP_FIXED | MAP_PRIVATE, 0);
        }
    
    #ifdef ELF_PLAT_INIT
        /*
         * The ABI may specify that certain registers be set up in special
         * ways (on i386 %edx is the address of a DT_FINI function, for
         * example).  In addition, it may also specify (eg, PowerPC64 ELF)
         * that the e_entry field is the address of the function descriptor
         * for the startup routine, rather than the address of the startup
         * routine itself.  This macro performs whatever initialization to
         * the regs structure is required as well as any relocations to the
         * function descriptor entries when executing dynamically linked apps.
         */
        ELF_PLAT_INIT(regs, reloc_func_desc);
    #endif
    
        start_thread(regs, elf_entry, bprm->p);
        retval = 0;
    out:
        kfree(loc);
    out_ret:
        return retval;
    
        /* error cleanup */
    out_free_dentry:
        allow_write_access(interpreter);
        if (interpreter)
            fput(interpreter);
    out_free_interp:
        kfree(elf_interpreter);
    out_free_ph:
        kfree(elf_phdata);
        goto out;
    }

      

  • 相关阅读:
    Map的两种遍历方法 keySet(),entrySet()
    http://localhost:8080请求用户名和密码。信息为:“XDB” 解决办法
    redis 集群出现的错误
    通信 缩略词
    redis
    hadoop 概念
    mysql 集群的一些概念
    SQL 基本关键字 函数 关联 连接
    JAVA集合
    SQL.Cookbook 读书笔记5 元数据查询
  • 原文地址:https://www.cnblogs.com/sj20082663/p/3108587.html
Copyright © 2011-2022 走看看