zoukankan      html  css  js  c++  java
  • select源码分析(linux2.6.11)

    本文以tcp poll为例子来分析select的源码,下面是函数调用顺序。
    select ---> sys_select ---> do_select ---> sock_poll ---> tcp_poll
    /*
     * sys_select - entry point shown for the select system call.
     *
     * Converts the optional user timeval into a timeout counted in ticks
     * (1/HZ s), copies the three requested fd bitmaps in from user space,
     * runs do_select(), then writes the remaining time and the three result
     * bitmaps back to user space.
     *
     * @n:    number of descriptors to examine; clamped to the task's max_fdset.
     * @inp:  user bitmap of fds watched for readability.
     * @outp: user bitmap of fds watched for writability.
     * @exp:  user bitmap of fds watched for exceptional conditions.
     * @tvp:  optional user timeval; when NULL the timeout stays at
     *        MAX_SCHEDULE_TIMEOUT.
     *
     * Returns do_select()'s non-negative result on success, or a negative
     * errno: -EINVAL (negative n / sec / usec), -ENOMEM (bitmap allocation
     * failed), -EFAULT (copying the results out failed), -ERESTARTNOHAND
     * (nothing ready and a signal is pending), or whatever error verify_area /
     * __get_user / get_fd_set / do_select reported.
     */
    asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
    {
        fd_set_bits fds;
        char *bits;
        long timeout;
        int ret, size, max_fdset;
    
        /* Default when no timeval is supplied. */
        timeout = MAX_SCHEDULE_TIMEOUT;
        if (tvp) {
            time_t sec, usec;
    
            /* Validate the user pointer, then fetch both timeval fields. */
            if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
                || (ret = __get_user(sec, &tvp->tv_sec))
                || (ret = __get_user(usec, &tvp->tv_usec)))
                goto out_nofds;
    
            ret = -EINVAL;
            if (sec < 0 || usec < 0)
                goto out_nofds;
    
            /*
             * Convert to ticks: microseconds are rounded up to whole ticks.
             * When sec is MAX_SELECT_SECONDS or more, the timeout is left at
             * MAX_SCHEDULE_TIMEOUT.
             */
            if ((unsigned long) sec < MAX_SELECT_SECONDS) {
                timeout = ROUND_UP(usec, 1000000/HZ);
                timeout += sec * (unsigned long) HZ;
            }
        }
    
        ret = -EINVAL;
        if (n < 0)
            goto out_nofds;
    
        /* max_fdset can increase, so grab it once to avoid race */
        max_fdset = current->files->max_fdset;
        if (n > max_fdset)
            n = max_fdset;
    
        /*
         * The buffer is carved into six consecutive bitmaps of `size` bytes:
         * three inputs (in/out/ex) followed by three results.
         * NOTE(review): presumably select_bits_alloc(size) reserves 6*size
         * bytes -- the helper is not shown here, confirm.
         */
        ret = -ENOMEM;
        size = FDS_BYTES(n);
        bits = select_bits_alloc(size);
        if (!bits)
            goto out_nofds;
        fds.in      = (unsigned long *)  bits;
        fds.out     = (unsigned long *) (bits +   size);
        fds.ex      = (unsigned long *) (bits + 2*size);
        fds.res_in  = (unsigned long *) (bits + 3*size);
        fds.res_out = (unsigned long *) (bits + 4*size);
        fds.res_ex  = (unsigned long *) (bits + 5*size);
    
      /* Copy the read, write and exception bitmaps of interest from user space into the kernel. */
        if ((ret = get_fd_set(n, inp, fds.in)) ||
            (ret = get_fd_set(n, outp, fds.out)) ||
            (ret = get_fd_set(n, exp, fds.ex)))
            goto out;
        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);
    
      /* The main work happens here. */
        ret = do_select(n, &fds, &timeout);
    
        /*
         * Report the remaining time back to the caller unless the task runs
         * with the STICKY_TIMEOUTS personality. The put_user() results are
         * not checked.
         */
        if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
            time_t sec = 0, usec = 0;
            if (timeout) {
                sec = timeout / HZ;
                usec = timeout % HZ;
                usec *= (1000000/HZ);
            }
            put_user(sec, &tvp->tv_sec);
            put_user(usec, &tvp->tv_usec);
        }
    
        if (ret < 0)
            goto out;
        /* Nothing ready: return -ERESTARTNOHAND if a signal is pending, else 0. */
        if (!ret) {
            ret = -ERESTARTNOHAND;
            if (signal_pending(current))
                goto out;
            ret = 0;
        }
    
        /* Copy the three result bitmaps back to user space. */
        if (set_fd_set(n, inp, fds.res_in) ||
            set_fd_set(n, outp, fds.res_out) ||
            set_fd_set(n, exp, fds.res_ex))
            ret = -EFAULT;
    
    out:
        select_bits_free(bits, size);
    out_nofds:
        return ret;
    }
    /*
     * do_select - poll every requested descriptor until something is ready.
     *
     * Walks the bitmaps fds->in / fds->out / fds->ex one word at a time, calls
     * each open file's poll method and records ready bits in fds->res_in /
     * fds->res_out / fds->res_ex. The scan repeats after schedule_timeout()
     * until at least one bit is ready, the timeout reaches zero, a signal is
     * pending, or table.error is set.
     *
     * @n:       upper bound on descriptors; replaced by max_select_fd()'s result
     *           (NOTE(review): presumably highest requested fd + 1 -- helper
     *           not shown, confirm).
     * @fds:     input and result bitmaps.
     * @timeout: in: timeout in ticks; out: the ticks left over.
     *
     * Returns the number of ready bits recorded (an fd ready in several sets is
     * counted once per set), or a negative error from max_select_fd() or
     * table.error.
     */
    int do_select(int n, fd_set_bits *fds, long *timeout)
    {
        struct poll_wqueues table;
        poll_table *wait;
        int retval, i;
        long __timeout = *timeout;
    
        spin_lock(&current->files->file_lock);
        retval = max_select_fd(n, fds);
        spin_unlock(&current->files->file_lock);
    
        if (retval < 0)
            return retval;
        n = retval;
    
        poll_initwait(&table);
        wait = &table.pt;
        /* A zero timeout means a pure poll: never register on any wait queue. */
        if (!__timeout)
            wait = NULL;
        retval = 0;
        for (;;) {
            unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
    
            /* Mark the current task as interruptibly sleeping; it has not actually been scheduled out yet. */
            set_current_state(TASK_INTERRUPTIBLE);
    
            inp = fds->in; outp = fds->out; exp = fds->ex;
            rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
    
            /* Outer loop: one iteration per bitmap word; i is the fd number. */
            for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
                unsigned long in, out, ex, all_bits, bit = 1, mask, j;
                unsigned long res_in = 0, res_out = 0, res_ex = 0;
                struct file_operations *f_op = NULL;
                struct file *file = NULL;
    
                /* Words with no requested bit are skipped a whole word (__NFDBITS fds) at a time. */
                in = *inp++; out = *outp++; ex = *exp++;
                all_bits = in | out | ex;
                if (all_bits == 0) {
                    i += __NFDBITS;
                    continue;
                }
    
                /* Walk every bit of interest in this word. */
                for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                    if (i >= n)
                        break;
                    if (!(bit & all_bits))
                        continue;
                    file = fget(i);
                    if (file) {
                        f_op = file->f_op;
                        mask = DEFAULT_POLLMASK;
                        if (f_op && f_op->poll)
                            /* Call the file's poll method. With a non-NULL wait it queues the current task on the
                              file's wait queue with a wake-up callback (see __pollwait) and returns the current
                              readable/writable/exception state. wait is passed as NULL once something is ready.
                              (Per the original notes, for sockets this is sock_poll, installed via socket_file_ops.) */
                            mask = (*f_op->poll)(file, retval ? NULL : wait);
                        fput(file);
                        /* Readable. */
                        if ((mask & POLLIN_SET) && (in & bit)) {
                            res_in |= bit;
                            retval++;
                        }
                        /* Writable. */
                        if ((mask & POLLOUT_SET) && (out & bit)) {
                            res_out |= bit;
                            retval++;
                        }
                        /* Exceptional condition. */
                        if ((mask & POLLEX_SET) && (ex & bit)) {
                            res_ex |= bit;
                            retval++;
                        }
                    }
                    /**
                    * Reschedule here if necessary.
                    */
                    cond_resched();
                }
                if (res_in)
                    *rinp = res_in;
                if (res_out)
                    *routp = res_out;
                if (res_ex)
                    *rexp = res_ex;
            }
            /* Scan finished: stop registering waiters on later passes, then leave the loop
               if anything is ready (retval != 0), the timeout is zero, or a signal is pending. */
            wait = NULL;
            if (retval || !__timeout || signal_pending(current))
                break;
            if(table.error) {
                retval = table.error;
                break;
            }
            /* Nothing was ready. schedule_timeout() lets the task sleep (with an unlimited select timeout,
              until it is woken). Per the original notes, when a driver receives data it invokes the wake-up
              callback registered through poll, and the woken task rescans the bits of interest. */
            __timeout = schedule_timeout(__timeout);
        }
        __set_current_state(TASK_RUNNING);
    
        poll_freewait(&table);
    
        /*
         * Up-to-date the caller timeout.
         */
        *timeout = __timeout;
        return retval;
    }
    /*
     * sock_poll - poll method for socket files. No kernel lock held - perfect.
     *
     * Looks up the socket behind the file's inode and forwards the request to
     * the protocol's own poll operation (tcp_poll in this walkthrough).
     * Errors cannot be reported to poll, so the answer is only ever an event
     * mask: it's either yes or no.
     */
    static unsigned int sock_poll(struct file *file, poll_table * wait)
    {
        struct socket *sock = SOCKET_I(file->f_dentry->d_inode);

        return sock->ops->poll(file, sock, wait);
    }
    /*
     * tcp_poll - compute the poll event mask for a TCP socket.
     *
     * Registers the caller on the socket's wait queue through poll_wait(),
     * hands listening sockets to tcp_listen_poll(), and otherwise builds the
     * mask from the error, shutdown, receive-sequence, send-space and
     * urgent-data state read below.
     *
     * @file: file being polled; passed on to poll_wait().
     * @sock: socket whose ->sk is examined.
     * @wait: poll table passed on to poll_wait() / tcp_listen_poll().
     *
     * Returns a bitwise OR of POLL* flags.
     */
    unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
    {
        unsigned int mask;
        struct sock *sk = sock->sk;
        struct tcp_sock *tp = tcp_sk(sk);
    
        /* Put the current task on the socket's wait queue, together with its wake-up callback. */
        poll_wait(file, sk->sk_sleep, wait);
        if (sk->sk_state == TCP_LISTEN)
            return tcp_listen_poll(sk, wait);
    
        mask = 0;
        if (sk->sk_err)
            mask = POLLERR;
    
        /* Both directions shut down, or socket closed: report hang-up. */
        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
            mask |= POLLHUP;
        /* Receive side shut down: report readable. */
        if (sk->sk_shutdown & RCV_SHUTDOWN)
            mask |= POLLIN | POLLRDNORM;
    
        /* Connected? */
        if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
            /* Potential race condition. If read of tp below will
             * escape above sk->sk_state, we can be illegally awaken
             * in SYN_* states. */
            if ((tp->rcv_nxt != tp->copied_seq) &&
                (tp->urg_seq != tp->copied_seq ||
                 tp->rcv_nxt != tp->copied_seq + 1 ||
                 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
                mask |= POLLIN | POLLRDNORM;
    
            if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
                if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
                    mask |= POLLOUT | POLLWRNORM;
                } else {  /* send SIGIO later */
                    set_bit(SOCK_ASYNC_NOSPACE,
                        &sk->sk_socket->flags);
                    set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
    
                    /* Race breaker. If space is freed after
                     * wspace test but before the flags are set,
                     * IO signal will be lost.
                     */
                    if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
                        mask |= POLLOUT | POLLWRNORM;
                }
            }
    
            if (tp->urg_data & TCP_URG_VALID)
                mask |= POLLPRI;
        }
        return mask;
    }
    /*
     * __pollwait - where the wait is actually registered; runs once for each
     * wait queue being monitored.
     *
     * Takes the next free poll_table_entry from the current page (getting a
     * fresh page first when there is none or it is full), pins the file with
     * get_file(), and queues the current task on wait_address.
     *
     * If no page can be obtained, -ENOMEM is recorded in the poll_wqueues
     * error field, the task state is set back to TASK_RUNNING and nothing is
     * queued.
     */
    void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
    {
        struct poll_wqueues *pwq = container_of(_p, struct poll_wqueues, pt);
        struct poll_table_page *page = pwq->table;
        struct poll_table_entry *slot;

        if (page == NULL || POLL_TABLE_FULL(page)) {
            struct poll_table_page *fresh =
                (struct poll_table_page *) __get_free_page(GFP_KERNEL);

            if (fresh == NULL) {
                pwq->error = -ENOMEM;
                __set_current_state(TASK_RUNNING);
                return;
            }
            /* Chain the new page in front of the old one. */
            fresh->entry = fresh->entries;
            fresh->next = page;
            pwq->table = fresh;
            page = fresh;
        }

        /* Claim the next entry slot in the page. */
        slot = page->entry;
        page->entry = slot + 1;

        get_file(filp);
        slot->filp = filp;
        slot->wait_address = wait_address;

        /* Queue the current task on the wait queue; the entry carries the wake-up function. */
        init_waitqueue_entry(&slot->wait, current);
        add_wait_queue(wait_address, &slot->wait);
    }
    /**
     * default_wake_function - wake-up callback used for non-exclusive waiters.
     *
     * A thin wrapper around try_to_wake_up(): it wakes the task stored in the
     * wait-queue entry, passing mode and sync straight through. The key
     * argument is not used.
     */
    int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
    {
        return try_to_wake_up(curr->task, mode, sync);
    }
    /**
     * try_to_wake_up - wake a sleeping or stopped task by setting its state to
     * TASK_RUNNING and activating it on a run queue.
     *
     * p     - descriptor of the task to wake.
     * state - mask of task states that may be woken; a task in any other
     *         state is left alone.
     * sync  - when set, the waker's share is subtracted from this CPU's load
     *         estimate and the preemption check is skipped if the task ends
     *         up on the local CPU.
     *
     * Returns 1 when the task was activated by this call, 0 otherwise (state
     * not in the mask, or the task already had p->array set).
     */
    static int try_to_wake_up(task_t * p, unsigned int state, int sync)
    {
        int cpu, this_cpu, success = 0;
        unsigned long flags;
        long old_state;
        runqueue_t *rq;
    #ifdef CONFIG_SMP
        unsigned long load, this_load;
        struct sched_domain *sd;
        int new_cpu;
    #endif
    
        /**
         * NOTE(review): per the original annotation, task_rq_lock() disables
         * interrupts and takes the lock of the run queue of the CPU the task
         * belongs to (which may differ from the current CPU's run queue, and
         * the task may not be queued on it). The helper is not shown here.
         */
        rq = task_rq_lock(p, &flags);
        schedstat_inc(rq, ttwu_cnt);
        old_state = p->state;
        /**
         * Only wake tasks whose state is in the `state` mask; otherwise bail
         * out and this wake-up has no effect.
         * Example from the original notes: a signal does not wake a task in
         * TASK_UNINTERRUPTIBLE.
         */
        if (!(old_state & state))
            goto out;
    
        /**
         * If the task already belongs to a run queue (p->array is set), jump
         * to out_running, which only sets its state to TASK_RUNNING.
         */
        if (p->array)
            goto out_running;
    
        cpu = task_cpu(p);
        this_cpu = smp_processor_id();
    
    #ifdef CONFIG_SMP
        /**
         * On SMP, decide whether the task should move from the run queue of
         * the CPU it last ran on to another CPU's run queue.
         */
    
        /**
         * The task is currently running on a CPU: skip the migration logic.
         */
        if (unlikely(task_running(rq, p)))
            goto out_activate;
    
        /**
         * By default, keep the task on the CPU it is already assigned to.
         */
        new_cpu = cpu;
    
        /**
         * If the task's CPU is the current CPU, or the task is not allowed to
         * run on the current CPU, jump to out_set_cpu.
         */
        if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
            goto out_set_cpu;
    
        load = source_load(cpu);
        this_load = target_load(this_cpu);
    
        /*
         * If sync wakeup then subtract the (maximum possible) effect of
         * the currently running task from the load of the current CPU:
         */
        if (sync)
            this_load -= SCHED_LOAD_SCALE;
    
        /* Don't pull the task off an idle CPU to a busy one */
        /**
         * If the task's CPU load is below SCHED_LOAD_SCALE/2 while this CPU's
         * load is above it, also jump to out_set_cpu (task stays on `cpu`).
         */
        if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
            goto out_set_cpu;
    
        /**
         * Try to migrate the task to the local CPU.
         */
        new_cpu = this_cpu; /* Wake to this CPU if we can */
    
        /*
         * Scan domains for affine wakeup and passive balancing
         * possibilities.
         */
        for_each_domain(this_cpu, sd) {
            unsigned int imbalance;
            /*
             * Start passive balancing when half the imbalance_pct
             * limit is reached.
             */
            imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
    
            if ((sd->flags & SD_WAKE_AFFINE) &&
                    !task_hot(p, rq->timestamp_last_tick, sd)) {
                /*
                 * This domain has SD_WAKE_AFFINE and p is cache cold
                 * in this domain.
                 */
                if (cpu_isset(cpu, sd->span)) {
                    schedstat_inc(sd, ttwu_wake_affine);
                    goto out_set_cpu;
                }
            } else if ((sd->flags & SD_WAKE_BALANCE) &&
                    imbalance*this_load <= 100*load) {
                /*
                 * This domain has SD_WAKE_BALANCE and there is
                 * an imbalance.
                 */
                if (cpu_isset(cpu, sd->span)) {
                    schedstat_inc(sd, ttwu_wake_balance);
                    goto out_set_cpu;
                }
            }
        }
    
        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
    out_set_cpu:
        schedstat_inc(rq, ttwu_attempts);
        new_cpu = wake_idle(new_cpu, p);
        if (new_cpu != cpu) {
            schedstat_inc(rq, ttwu_moved);
            set_task_cpu(p, new_cpu);
            task_rq_unlock(rq, &flags);
            /* might preempt at this point */
            rq = task_rq_lock(p, &flags);
            /* The lock was dropped, so re-read and re-check the task's state. */
            old_state = p->state;
            if (!(old_state & state))
                goto out;
            if (p->array)
                goto out_running;
    
            this_cpu = smp_processor_id();
            cpu = task_cpu(p);
        }
    
    out_activate:
    #endif /* CONFIG_SMP */
        /**
         * If the task was in TASK_UNINTERRUPTIBLE, decrement
         * nr_uninterruptible and set activated to -1 to record that it was
         * woken from TASK_UNINTERRUPTIBLE.
         */
        if (old_state == TASK_UNINTERRUPTIBLE) {
            rq->nr_uninterruptible--;
            /*
             * Tasks on involuntary sleep don't earn
             * sleep_avg beyond just interactive state.
             */
            p->activated = -1;
        }
    
        /*
         * Sync wakeups (i.e. those types of wakeups where the waker
         * has indicated that it will leave the CPU in short order)
         * don't trigger a preemption, if the woken up task will run on
         * this cpu. (in this case the 'I will reschedule' promise of
         * the waker guarantees that the freshly woken up task is going
         * to be considered on this CPU.)
         */
        /**
         * NOTE(review): per the original annotation (activate_task is not
         * shown here), activate_task performs these steps in order:
         *     1: call sched_clock for the current timestamp; if the target CPU
         *        is not the local CPU, also compensate for timer-tick skew.
         *     2: call recalc_task_prio to compute the dynamic priority.
         *     3: set `activated` as appropriate.
         *     4: set the task's timestamp.
         *     5: insert the task into the set of runnable tasks.
         */
        activate_task(p, rq, cpu == this_cpu);
        /**
         * If the target CPU is not the local CPU, or this is not a sync
         * wake-up, check whether the woken task should preempt the task
         * currently running on that run queue.
         */
        if (!sync || cpu != this_cpu) {
            if (TASK_PREEMPTS_CURR(p, rq))/* p outranks the task currently running on this run queue, so preempt it. */
                /**
                 * resched_task requests the preemption.
                 * NOTE(review): per the original annotation, on a uniprocessor
                 * it only sets TIF_NEED_RESCHED; on SMP it may send an IPI to
                 * force the other CPU to reschedule. Helper not shown here.
                 */
                resched_task(rq->curr);
        }
        success = 1;
    
    out_running:
        /**
         * Set the task state to TASK_RUNNING. Two paths arrive here: the
         * normal activation above and the "already queued" shortcut.
         */
        p->state = TASK_RUNNING;
    out:
        /**
         * Release the run-queue lock taken by task_rq_lock() (per the original
         * annotation this also re-enables interrupts).
         */
        task_rq_unlock(rq, &flags);
    
        /**
         * Return 0 if the task was not activated by this call, 1 if it was.
         */
        return success;
    }

    当底层驱动收到数据后,会产生中断信号,调用 default_wake_function函数来唤醒对应的进程,唤醒后进程继续do_select来检查关心的bit位。至于驱动具体是如何通知上层的,还需要进一步学习与分析。

  • 相关阅读:
    请求的详细资料级别没有事实表
    BIEE汇总数据如何放在后面
    Biee仪表盘中提示空值如何去掉
    UFT测试本地应用程序登陆小实例(描述性编程)
    Mysql找回丢失密码
    linux下Mysql多实例实现
    如何从零安装Mysql
    Linux系统下yum源配置(Centos 6)
    Linux系统管理常用命令用法总结(2)
    Linux系统管理常用命令用法总结(1)
  • 原文地址:https://www.cnblogs.com/jaydenhpj/p/5121030.html
Copyright © 2011-2022 走看看