zoukankan      html  css  js  c++  java
  • Linux中的进程调度(六)

    从现在开始来分析和负载平衡有关的策略。
    /*
     * This function gets called by the timer code, with HZ frequency.
     * We call it with interrupts disabled.
     *
     * It also gets called by the fork code, when changing the parent's
     * timeslices.
     */
    void scheduler_tick(void)
    {
    	int cpu = smp_processor_id();
    	struct rq *rq = cpu_rq(cpu);
    	struct task_struct *curr = rq->curr;
    
    	sched_clock_tick();
    
    	/*
    	 * The runqueue clock, the cpu load and the per-class tick hook of
    	 * the currently running task are all updated under rq->lock.
    	 */
    	spin_lock(&rq->lock);
    	update_rq_clock(rq);
    	update_cpu_load(rq);
    	curr->sched_class->task_tick(rq, curr, 0);
    	spin_unlock(&rq->lock);
    
    #ifdef CONFIG_SMP
    	/*
    	 * Record whether this cpu's run queue is empty at this tick (only
    	 * the idle task), then check whether a load balance pass is due.
    	 */
    	rq->idle_at_tick = idle_cpu(cpu);
    	trigger_load_balance(rq, cpu);
    #endif
    }
    可见,在每次处理时钟中断时,在最后会检查一下是否需要进行一次负载平衡。 进入到trigger_load_balance中去,从名字就可以猜出个大概。
    /*
     * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
     *
     * In case of CONFIG_NO_HZ, this is the place where we nominate a new
     * idle load balancing owner or decide to stop the periodic load balancing,
     * if the whole system is idle.
     */
    static inline void trigger_load_balance(struct rq *rq, int cpu)
    {
    #ifdef CONFIG_NO_HZ
    	/*
    	 * If we were in the nohz mode recently and busy at the current
    	 * scheduler tick, then check if we need to nominate new idle
    	 * load balancer.
    	 */
    	if (rq->in_nohz_recently && !rq->idle_at_tick) {
    		rq->in_nohz_recently = 0;
    
    		/* This cpu is busy now: give up the idle load balancer role. */
    		if (atomic_read(&nohz.load_balancer) == cpu) {
    			cpumask_clear_cpu(cpu, nohz.cpu_mask);
    			atomic_set(&nohz.load_balancer, -1);
    		}
    
    		if (atomic_read(&nohz.load_balancer) == -1) {
    			/*
    			 * simple selection for now: Nominate the
    			 * first cpu in the nohz list to be the next
    			 * ilb owner.
    			 *
    			 * TBD: Traverse the sched domains and nominate
    			 * the nearest cpu in the nohz.cpu_mask.
    			 */
    			int ilb = cpumask_first(nohz.cpu_mask);
    
    			/* presumably ilb >= nr_cpu_ids means the mask is empty */
    			if (ilb < nr_cpu_ids)
    				resched_cpu(ilb);
    		}
    	}
    
    	/*
    	 * If this cpu is idle and doing idle load balancing for all the
    	 * cpus with ticks stopped, is it time for that to stop?
    	 */
    	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
    	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
    		resched_cpu(cpu);
    		return;
    	}
    
    	/*
    	 * If this cpu is idle and the idle load balancing is done by
    	 * someone else, then no need raise the SCHED_SOFTIRQ
    	 */
    	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
    	    cpumask_test_cpu(cpu, nohz.cpu_mask))
    		return;
    #endif
    	/*
    	 * Periodic balancing is due once jiffies has reached
    	 * rq->next_balance; the balancing itself is deferred to the
    	 * SCHED_SOFTIRQ handler.
    	 */
    	if (time_after_eq(jiffies, rq->next_balance))
    		raise_softirq(SCHED_SOFTIRQ);
    }
    忽略掉CONFIG_NO_HZ的部分,可以看到,这个函数就是判断一下当前的jiffies是不是已经达到或超过rq->next_balance的值,如果是,就会进一步调用raise_softirq提交一个软中断。提交的过程很简单,就是把SCHED_SOFTIRQ对应的位置位,处理软中断时检查该位是否置位,如果置位就调用相应的软中断处理函数。 用cscope在源码中搜索,发现有如下语句:
     #ifdef CONFIG_SMP
         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
     #endif
    呵,这个软中断还是特别为SMP架构准备的呢~ 顺着这个线索,去查看run_rebalance_domains的实现
    /*
     * run_rebalance_domains is triggered when needed from the scheduler tick.
     * In CONFIG_NO_HZ case, the idle load balance owner will do the
     * rebalancing for all the cpus for whom scheduler ticks are stopped.
     */
    static void run_rebalance_domains(struct softirq_action *h)
    {
    	int this_cpu = smp_processor_id();
    	struct rq *this_rq = cpu_rq(this_cpu);
    	/* Idle state is the one sampled at the last scheduler tick. */
    	enum cpu_idle_type idle = this_rq->idle_at_tick ?
    						CPU_IDLE : CPU_NOT_IDLE;
    
    	/* Always balance the domains of the cpu running the softirq. */
    	rebalance_domains(this_cpu, idle);
    
    #ifdef CONFIG_NO_HZ
    	/*
    	 * If this cpu is the owner for idle load balancing, then do the
    	 * balancing on behalf of the other idle cpus whose ticks are
    	 * stopped.
    	 */
    	if (this_rq->idle_at_tick &&
    	    atomic_read(&nohz.load_balancer) == this_cpu) {
    		struct rq *rq;
    		int balance_cpu;
    
    		for_each_cpu(balance_cpu, nohz.cpu_mask) {
    			/* Our own domains were already balanced above. */
    			if (balance_cpu == this_cpu)
    				continue;
    
    			/*
    			 * If this cpu gets work to do, stop the load balancing
    			 * work being done for other cpus. Next load
    			 * balancing owner will pick it up.
    			 */
    			if (need_resched())
    				break;
    
    			rebalance_domains(balance_cpu, CPU_IDLE);
    
    			/*
    			 * Pull this_rq->next_balance in so that it is no later
    			 * than the next_balance of any cpu balanced here.
    			 */
    			rq = cpu_rq(balance_cpu);
    			if (time_after(this_rq->next_balance, rq->next_balance))
    				this_rq->next_balance = rq->next_balance;
    		}
    	}
    #endif
    }
    忽略CONFIG_NO_HZ,那么这个函数就是根据当前cpu的负载状态(为idle进程还是其它)确定idle参数,然后调用rebalance_domains
    /*
     * It checks each scheduling domain to see if it is due to be balanced,
     * and initiates a balancing operation if so.
     *
     * Balancing parameters are set up in arch_init_sched_domains.
     */
    static void rebalance_domains(int cpu, enum cpu_idle_type idle)
    {
    	int balance = 1;
    	struct rq *rq = cpu_rq(cpu);
    	unsigned long interval;
    	struct sched_domain *sd;
    	/* Earliest time when we have to do rebalance again */
    	unsigned long next_balance = jiffies + 60*HZ;
    	int update_next_balance = 0;
    	int need_serialize;
    	cpumask_var_t tmp;
    
    	/* Fails alloc?  Rebalancing probably not a priority right now. */
    	if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
    		return;
    
    	/* Walk this cpu's scheduling domain and then each parent domain. */
    	for_each_domain(cpu, sd) {
    		/* Skip domains that have opted out of load balancing. */
    		if (!(sd->flags & SD_LOAD_BALANCE))
    			continue;
    
    		/* Start from the balance period of this domain ... */
    		interval = sd->balance_interval;
    		/* ... and stretch it by busy_factor unless the cpu is idle. */
    		if (idle != CPU_IDLE)
    			interval *= sd->busy_factor;
    
    		/* scale ms to jiffies */
    		interval = msecs_to_jiffies(interval);
    		if (unlikely(!interval))
    			interval = 1;
    		/* Clamp the period to an upper bound of HZ*NR_CPUS/10 jiffies. */
    		if (interval > HZ*NR_CPUS/10)
    			interval = HZ*NR_CPUS/10;
    
    		need_serialize = sd->flags & SD_SERIALIZE;
    
    		/*
    		 * Serialized domains share the 'balancing' lock; if it is
    		 * already held, skip the balance attempt at this level but
    		 * still account for it in next_balance below.
    		 */
    		if (need_serialize) {
    			if (!spin_trylock(&balancing))
    				goto out;
    		}
    
    		/* The period has elapsed: a balance pass is really due. */
    		if (time_after_eq(jiffies, sd->last_balance + interval)) {
    			if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
    				/*
    				 * We've pulled tasks over so either we're no
    				 * longer idle, or one of our SMT siblings is
    				 * not idle.
    				 */
    				idle = CPU_NOT_IDLE;
    			}
    			/* Remember when this domain was last balanced. */
    			sd->last_balance = jiffies;
    		}
    		if (need_serialize)
    			spin_unlock(&balancing);
    out:
    		/* Track the earliest time any domain needs balancing again. */
    		if (time_after(next_balance, sd->last_balance + interval)) {
    			next_balance = sd->last_balance + interval;
    			update_next_balance = 1;
    		}
    
    		/*
    		 * Stop the load balance at this level. There is another
    		 * CPU in our sched group which is doing load balancing more
    		 * actively.
    		 */
    		if (!balance)
    			break;
    	}
    
    	/*
    	 * next_balance will be updated only when there is a need.
    	 * When the cpu is attached to null domain for ex, it will not be
    	 * updated.
    	 */
    	if (likely(update_next_balance))
    		rq->next_balance = next_balance;
    
    	free_cpumask_var(tmp);
    }
    重点就是load_balance了
    /*
     * Check this_cpu to ensure it is balanced within domain. Attempt to move
     * tasks if there is an imbalance.
     */
    static int load_balance(int this_cpu, struct rq *this_rq,
    			struct sched_domain *sd, enum cpu_idle_type idle,
    			int *balance, struct cpumask *cpus)
    {
    	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    	struct sched_group *group;
    	unsigned long imbalance;
    	struct rq *busiest;
    	unsigned long flags;
    
    	cpumask_setall(cpus);//先将所有cpu置位
    
    	/*
    	 * When power savings policy is enabled for the parent domain, idle
    	 * sibling can pick up load irrespective of busy siblings. In this case,
    	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
    	 * portraying it as CPU_NOT_IDLE.
    	 */
    	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
    	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))//SMP结构中,SHARE_CPUPOWER不会出现
    		sd_idle = 1;
    
    	schedstat_inc(sd, lb_count[idle]);//更新统计信息
    
    redo:
    	update_shares(sd);//将此调度域中的每个参加调度的进程组的share值进行更新
    	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
    				   cpus, balance);//找也该调度域中最忙的调度组
    
    	if (*balance == 0)
    		goto out_balanced;
    
    	if (!group) {
    		schedstat_inc(sd, lb_nobusyg[idle]);
    		goto out_balanced;
    	}
    
    	busiest = find_busiest_queue(group, idle, imbalance, cpus);
    	if (!busiest) {
    		schedstat_inc(sd, lb_nobusyq[idle]);
    		goto out_balanced;
    	}
    
    	BUG_ON(busiest == this_rq);
    
    	schedstat_add(sd, lb_imbalance[idle], imbalance);
    
    	ld_moved = 0;
    	if (busiest->nr_running > 1) {
    		/*
    		 * Attempt to move tasks. If find_busiest_group has found
    		 * an imbalance but busiest->nr_running nr_balance_failed++;
    
    		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
    
    			spin_lock_irqsave(&busiest->lock, flags);
    
    			/* don't kick the migration_thread, if the curr
    			 * task on busiest cpu can't be moved to this_cpu
    			 */
    			if (!cpumask_test_cpu(this_cpu,
    					      &busiest->curr->cpus_allowed)) {
    				spin_unlock_irqrestore(&busiest->lock, flags);
    				all_pinned = 1;
    				goto out_one_pinned;
    			}
    
    			if (!busiest->active_balance) {
    				busiest->active_balance = 1;
    				busiest->push_cpu = this_cpu;
    				active_balance = 1;
    			}
    			spin_unlock_irqrestore(&busiest->lock, flags);
    			if (active_balance)
    				wake_up_process(busiest->migration_thread);
    
    			/*
    			 * We've kicked active balancing, reset the failure
    			 * counter.
    			 */
    			sd->nr_balance_failed = sd->cache_nice_tries+1;
    		}
    	} else
    		sd->nr_balance_failed = 0;
    
    	if (likely(!active_balance)) {
    		/* We were unbalanced, so reset the balancing interval */
    		sd->balance_interval = sd->min_interval;
    	} else {
    		/*
    		 * If we've begun active balancing, start to back off. This
    		 * case may not be covered by the all_pinned logic if there
    		 * is only 1 task on the busy runqueue (because we don't call
    		 * move_tasks).
    		 */
    		if (sd->balance_interval max_interval)
    			sd->balance_interval *= 2;
    	}
    
    	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
    	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
    		ld_moved = -1;
    
    	goto out;
    
    out_balanced:
    	schedstat_inc(sd, lb_balanced[idle]);
    
    	sd->nr_balance_failed = 0;
    
    out_one_pinned:
    	/* tune up the balancing interval */
    	if ((all_pinned && sd->balance_interval balance_interval max_interval))
    		sd->balance_interval *= 2;
    
    	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
    	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
    		ld_moved = -1;
    	else
    		ld_moved = 0;
    out:
    	if (ld_moved)
    		update_shares(sd);
    	return ld_moved;
    }
    
    
    其中update_shares有必要去看一下
    /*
     * Refresh the per-cpu shares of every task group over domain @sd, rate
     * limited by sysctl_sched_shares_ratelimit.
     */
    static void update_shares(struct sched_domain *sd)
    {
    	u64 now = cpu_clock(raw_smp_processor_id());
    	s64 elapsed = now - sd->last_update;
    
    	/*
    	 * Second check before doing any work: only walk the task group
    	 * tree when at least the rate-limit interval has passed since the
    	 * last update of this domain.
    	 */
    	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
    		sd->last_update = now;
    		/* tg_nop on the way down, tg_shares_up on the way back up. */
    		walk_tg_tree(tg_nop, tg_shares_up, sd);
    	}
    }
    
    
    如果真的需要去更新该调度域的各个进程组的share值的话,将调用walk_tg_tree进行更新操作,tg_nop和tg_shares_up是两个函数指针,其中在这里,tg_nop进行空操作,tg_shares_up将进行真正的更新操作。
    /*
     * Iterate the full tree, calling @down when first entering a node and @up when
     * leaving it for the final time.
     */
    static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
    {
    	struct task_group *parent, *child;
    	int ret;
    
    	rcu_read_lock();
    	/* The walk starts at the root task group. */
    	parent = &root_task_group;
    down:
    	/* Entering node 'parent': run the @down visitor on it. */
    	ret = (*down)(parent, data);
    	if (ret)
    		goto out_unlock;
    	list_for_each_entry_rcu(child, &parent->children, siblings) {
    		/* Descend: the child becomes the node being visited. */
    		parent = child;
    		goto down;
    
    up:
    		/*
    		 * Re-entered from below with 'child' set to the node just
    		 * finished and 'parent' set to its parent; 'continue'
    		 * advances to the next sibling.
    		 */
    		continue;
    	}
    	/* All children of 'parent' are done: run the @up visitor on it. */
    	ret = (*up)(parent, data);
    	if (ret)
    		goto out_unlock;
    
    	/* Climb one level and resume the parent's child iteration. */
    	child = parent;
    	parent = parent->parent;
    	if (parent)
    		goto up;
    out_unlock:
    	rcu_read_unlock();
    
    	/* First nonzero visitor result, or the result of the final @up call. */
    	return ret;
    }
    
    代码比较难读,不如自己在纸上画个图,实际走一遍,就看清楚了,这里实际上就是自下而上,从左到右,依次更新每个调度组的share值,具体的更新方法在tg_shares_up函数中体现。
    //注释中也说明了刚才的遍历方法
    /*
     * Re-compute the task group their per cpu shares over the given domain.
     * This needs to be done in a bottom-up fashion because the rq weight of a
     * parent group depends on the shares of its child groups.
     */
    static int tg_shares_up(struct task_group *tg, void *data)
    {
    	unsigned long weight, rq_weight = 0;
    	unsigned long shares = 0;
    	struct sched_domain *sd = data;
    	int i;
    
    	/* First pass: gather totals over every cpu in the domain's span. */
    	for_each_cpu(i, sched_domain_span(sd)) {
    		/*
    		 * If there are currently no tasks on the cpu pretend there
    		 * is one of average load so that when a new task gets to
    		 * run here it will not get delayed by group starvation.
    		 */
    		/* Load weight of this group's cfs_rq on cpu i. */
    		weight = tg->cfs_rq[i]->load.weight;
    		/* No load on this cpu: substitute NICE_0_LOAD (see above). */
    		if (!weight)
    			weight = NICE_0_LOAD;
    
    		/* Note: this is the rq_weight field of the cfs_rq. */
    		tg->cfs_rq[i]->rq_weight = weight;
    		/* Running total of rq_weight over the domain. */
    		rq_weight += weight;
    		/* Running total of the per-cpu shares over the domain. */
    		shares += tg->cfs_rq[i]->shares;
    	}
    	/* Fall back to, or cap at, the group's configured shares. */
    	if ((!shares && rq_weight) || shares > tg->shares)
    		shares = tg->shares;
    
    	/*
    	 * With no parent domain, or a parent that does not load balance,
    	 * the full tg->shares is used. (Author's note: this is said to hold
    	 * on plain SMP -- verify against the domain setup.)
    	 */
    	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
    		shares = tg->shares;
    
    	/* Second pass: apply the totals to each cpu of the domain. */
    	for_each_cpu(i, sched_domain_span(sd))
    		update_group_shares_cpu(tg, i, shares, rq_weight);
    
    	return 0;
    }
    
    再来看update_group_shares_cpu
    /*
     * Calculate and set the cpu's group shares.
     *
     * As called from tg_shares_up(): @tg is the task group currently being
     * visited by the tree walk, @cpu is one cpu of the scheduling domain,
     * @sd_shares is the shares value computed for the group over the domain
     * and @sd_rq_weight is the sum of the group's rq_weight over all cpus of
     * the domain.
     */
    static void
    update_group_shares_cpu(struct task_group *tg, int cpu,
    			unsigned long sd_shares, unsigned long sd_rq_weight)
    {
    	unsigned long shares;
    	unsigned long rq_weight;
    
    	/* Nothing to update when the group has no entity on this cpu. */
    	if (!tg->se[cpu])
    		return;
    
    	rq_weight = tg->cfs_rq[cpu]->rq_weight;
    
    	/*
    	 *           Sum shares * rq_weight
    	 * shares =  -----------------------
    	 *               Sum rq_weight
    	 *
    	 */
    	shares = (sd_shares * rq_weight) / sd_rq_weight;
    	/* Keep the result within [MIN_SHARES, MAX_SHARES]. */
    	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
    
    	/*
    	 * To avoid updating too often, only apply the new value when it
    	 * differs from the current weight by more than the tunable
    	 * threshold.
    	 */
    	if (abs(shares - tg->se[cpu]->load.weight) >
    			sysctl_sched_shares_thresh) {
    		struct rq *rq = cpu_rq(cpu);
    		unsigned long flags;
    
    		spin_lock_irqsave(&rq->lock, flags);
    		tg->cfs_rq[cpu]->shares = shares;
    
    		/* The shares finally land in the group's scheduling entity. */
    		__set_se_shares(tg->se[cpu], shares);
    		spin_unlock_irqrestore(&rq->lock, flags);
    	}
    }
    
    
    注意上面的计算方法,是说对组内的cpu来讲,共同来分担该组的shares值,具体的分担方法是,按比例来,哪个cpu的负载占所有cpu负载的百分比大,哪个cpu分得的shares值也就大一些,优先级就大一些,运行的时候就会多一些筹码,这里需要返回去看pick_up_next的部分代码 __set_se_shares代码如下
    /*
     * Give scheduling entity @se the load weight @shares. An entity that is
     * currently queued is taken off its cfs_rq for the duration of the change
     * and put back afterwards.
     */
    static void __set_se_shares(struct sched_entity *se, unsigned long shares)
    {
    	struct cfs_rq *owner = se->cfs_rq;
    	int was_queued = se->on_rq;
    
    	if (was_queued)
    		dequeue_entity(owner, se, 0);
    
    	/* inv_weight is reset to 0 together with the new weight. */
    	se->load.weight = shares;
    	se->load.inv_weight = 0;
    
    	if (was_queued)
    		enqueue_entity(owner, se, 0);
    }
    
    
    很好理解,先移出可执行队列,更新其负载后,再移入可执行队列。 update_shares到这里就分析完了,注意update_shares的执行时间是在已经确定需要进行负载平衡,但是还没有开始确定怎么平衡之前。先更新一下该调度域中各个组中的负载情况,有助于下面的调度组以及进程的选择。 返回刚才的load_balance函数,继续往下进行。
    redo:
    	update_shares(sd);//将此调度域中的每个参加调度的进程组的share值进行更新
    	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
    				   cpus, balance);//找到该调度域中最忙的调度组
    
    	if (*balance == 0)
    		goto out_balanced;
    
    	if (!group) {//如果都不太忙,当然不需要平衡操作
    		schedstat_inc(sd, lb_nobusyg[idle]);
    		goto out_balanced;
    	}
    
    	busiest = find_busiest_queue(group, idle, imbalance, cpus);//找到该组中最忙的运行队列,也就是最忙的cpu
    	if (!busiest) {//如果所有cpu都不符合标准,也不需要平衡操作
    		schedstat_inc(sd, lb_nobusyq[idle]);
    		goto out_balanced;
    	}
    
    	BUG_ON(busiest == this_rq);
    
    	schedstat_add(sd, lb_imbalance[idle], imbalance);//更新统计信息
    
    	ld_moved = 0;//是否移动了某些进程的标志
    	if (busiest->nr_running > 1) {//该cpu上可运行进程数大于1时才考虑移动进程,注释中写了,如果进程数小于1,那么将其移走后,进程数达到零,不还是不平衡么?所以干脆不移动
    		/*
    		 * Attempt to move tasks. If find_busiest_group has found
    		 * an imbalance but busiest->nr_running <= 1, the group is
    		 * still unbalanced. ld_moved simply stays zero, so it is
    		 * correctly treated as an imbalance.
    		 */
    		local_irq_save(flags);
    		double_rq_lock(this_rq, busiest);
    		ld_moved = move_tasks(this_rq, this_cpu, busiest,//根据imbalance参数的指示,在busiest队列中挑选可进程,移动到this_rq中去。
    				      imbalance, sd, idle, &all_pinned);
    		double_rq_unlock(this_rq, busiest);
    		local_irq_restore(flags);
    
    		/*
    		 * some other cpu did the load balance for us.
    		 */
    		if (ld_moved && this_cpu != smp_processor_id())
    			resched_cpu(this_cpu);
    
    		/* All tasks on this runqueue were pinned by CPU affinity */
    		if (unlikely(all_pinned)) {
    			cpumask_clear_cpu(cpu_of(busiest), cpus);
    			if (!cpumask_empty(cpus))
    				goto redo;
    			goto out_balanced;
    		}
    	}
    
    先看 find_busiest_group函数,这个函数比较长
    /*
     * find_busiest_group finds and returns the busiest CPU group within the
     * domain. It calculates and returns the amount of weighted load which
     * should be moved to restore balance via the imbalance parameter.
     */
    static struct sched_group *
    find_busiest_group(struct sched_domain *sd, int this_cpu,
    		   unsigned long *imbalance, enum cpu_idle_type idle,
    		   int *sd_idle, const struct cpumask *cpus, int *balance)
    {
    	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
    	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
    	unsigned long max_pull;
    	unsigned long busiest_load_per_task, busiest_nr_running;
    	unsigned long this_load_per_task, this_nr_running;
    	int load_idx, group_imb = 0;
    #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    	int power_savings_balance = 1;
    	unsigned long leader_nr_running = 0, min_load_per_task = 0;
    	unsigned long min_nr_running = ULONG_MAX;
    	struct sched_group *group_min = NULL, *group_leader = NULL;
    #endif
    
    	max_load = this_load = total_load = total_pwr = 0;
    	busiest_load_per_task = busiest_nr_running = 0;
    	this_load_per_task = this_nr_running = 0;
    
    	if (idle == CPU_NOT_IDLE)//先根据传进来的idle参数来确定load_idx的值,这个值在下面寻找最忙调度组(不是进程组)时会作为一个重要指标
    		load_idx = sd->busy_idx;//busy_idx默认为3
    	else if (idle == CPU_NEWLY_IDLE)
    		load_idx = sd->newidle_idx;//newidle_idx为2
    	else
    		load_idx = sd->idle_idx;idle_idx为1
    
    	do {//从这里一直到while(group!=sd->groups)是一个大循环,其目的就是遍布这个调度域中所有的调度组,找出最忙的那个,其中,this_cpu所属的调度组不参与与其它调度组的竞争
    		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
    		int local_group;
    		int i;
    		int __group_imb = 0;
    		unsigned int balance_cpu = -1, first_idle_cpu = 0;
    		unsigned long sum_nr_running, sum_weighted_load;
    		unsigned long sum_avg_load_per_task;
    		unsigned long avg_load_per_task;
    
    		local_group = cpumask_test_cpu(this_cpu,
    					       sched_group_cpus(group));//如果发现tihs_cpu属于当前的调度组,那么将local_group置位
    
    		if (local_group)
    			balance_cpu = cpumask_first(sched_group_cpus(group));//如果正在处理"local_group",那么将balance_cpu暂定为该组中第一个cpu
    
    		/* Tally up the load of all CPUs in the group */
    		sum_weighted_load = sum_nr_running = avg_load = 0;
    		sum_avg_load_per_task = avg_load_per_task = 0;
    
    		max_cpu_load = 0;
    		min_cpu_load = ~0UL;
    
    		for_each_cpu_and(i, sched_group_cpus(group), cpus) {//对于该组中每个cpu
    			struct rq *rq = cpu_rq(i);
    
    			if (*sd_idle && rq->nr_running)
    				*sd_idle = 0;
    
    			/* Bias balancing toward cpus of our domain */
    			if (local_group) {//如果是本地组,且当前cpu为idle_cpu,并在循环中还没有进行过对balance_cpu的修正
    				if (idle_cpu(i) && !first_idle_cpu) {
    					first_idle_cpu = 1;
    					balance_cpu = i;//将balance_cpu置为i,仔细考虑下,这里的逻辑就是说,如果本地组中有空闲cpu,那么就将第一个空闲cpu作为balance_cpu,否则,将该组中第一个cpu作为balance_cpu
    				}
    
    				load = target_load(i, load_idx);//累加计算该组的负载,增加的数目要根据前面确定的load_idx来确定
    			} else {//如果当前组不是本地组
    				load = source_load(i, load_idx);//同上
    				if (load > max_cpu_load)//如果该调度组的总负载大于已经找到的最大负载,或者小于已经找到的最小负载,则更新最大/最小值
    					max_cpu_load = load;
    				if (min_cpu_load > load)
    					min_cpu_load = load;
    			}
    
    			avg_load += load;//根据load_idx计算出来的负载之和
    			sum_nr_running += rq->nr_running;//组内各个cpu上可运行队列中进程数目之和
    			sum_weighted_load += weighted_cpuload(i);//该组内当前所有cpu的负载之和,注意这里是当前的,和avg_load不同,因为avg_load的计算涉及到历史值,也就是和load_idx有关
    
    			sum_avg_load_per_task += cpu_avg_load_per_task(i);//该cpu上所有进程的平均负载
    		}
    
    		/*
    		 * First idle cpu or the first cpu(busiest) in this sched group
    		 * is eligible for doing load balancing at this and above
    		 * domains. In the newly idle case, we will allow all the cpu's
    		 * to do the newly idle load balance.
    		 */
    		if (idle != CPU_NEWLY_IDLE && local_group &&
    		    balance_cpu != this_cpu && balance) {
    			*balance = 0;
    			goto ret;
    		}
    
    		total_load += avg_load;//调度域的总负载
    		total_pwr += group->__cpu_power;//这个cpu_power还没弄清是怎么回事
    
    		/* Adjust by relative CPU power of the group */
    		avg_load = sg_div_cpu_power(group,
    				avg_load * SCHED_LOAD_SCALE);//根据该总的avg_load以及其power确定该组最终的avg_load
    
    
    		/*
    		 * Consider the group unbalanced when the imbalance is larger
    		 * than the average weight of two tasks.
    		 *
    		 * APZ: with cgroup the avg task weight can vary wildly and
    		 *      might not be a suitable number - should we keep a
    		 *      normalized nr_running number somewhere that negates
    		 *      the hierarchy?
    		 */
    		avg_load_per_task = sg_div_cpu_power(group,
    				sum_avg_load_per_task * SCHED_LOAD_SCALE);//同样,修正该组的avg_load_per_task值
    
    		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)//如果该组内最大cpu负载值与最小cpu负载值之差大于平均负载值的2倍,则__group_imb(imbalance)置1,下面会看到它的作用
    			__group_imb = 1;
    
    		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
    
    		if (local_group) {//如果是本地组,只更新this相关的变量,并不更新busiest指针的指向
    			this_load = avg_load;
    			this = group;
    			this_nr_running = sum_nr_running;
    			this_load_per_task = sum_weighted_load;
    		} else if (avg_load > max_load &&
    			   (sum_nr_running > group_capacity || __group_imb)) {//如果有不平衡的情况,或者组内的进程数目已经超过了该组的能力,且该组的平均负载大于已知的其它组的最大平均负载
    			max_load = avg_load;//更新最大值
    			busiest = group;//更新指针指向
    			busiest_nr_running = sum_nr_running;//更新最忙组中的进程数目
    			busiest_load_per_task = sum_weighted_load;
    			group_imb = __group_imb;
    		}
    
    #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)//SMT结构暂不分析,所以会跳过很长的代码,直到整个do-while循环结束
    		/*
    		 * Busy processors will not participate in power savings
    		 * balance.
    		 */
    		if (idle == CPU_NOT_IDLE ||
    				!(sd->flags & SD_POWERSAVINGS_BALANCE))
    			goto group_next;
    
    		/*
    		 * If the local group is idle or completely loaded
    		 * no need to do power savings balance at this domain
    		 */
    		if (local_group && (this_nr_running >= group_capacity ||
    				    !this_nr_running))
    			power_savings_balance = 0;
    
    		/*
    		 * If a group is already running at full capacity or idle,
    		 * don't include that group in power savings calculations
    		 */
    		if (!power_savings_balance || sum_nr_running >= group_capacity
    		    || !sum_nr_running)
    			goto group_next;
    
    		/*
    		 * Calculate the group which has the least non-idle load.
    		 * This is the group from where we need to pick up the load
    		 * for saving power
    		 */
    		if ((sum_nr_running 
    		     cpumask_first(sched_group_cpus(group_min)))) {
    			group_min = group;
    			min_nr_running = sum_nr_running;
    			min_load_per_task = sum_weighted_load /
    						sum_nr_running;
    		}
    
    		/*
    		 * Calculate the group which is almost near its
    		 * capacity but still has some space to pick up some load
    		 * from other group and save more power
    		 */
    		if (sum_nr_running  leader_nr_running ||
    			    (sum_nr_running == leader_nr_running &&
    			     cpumask_first(sched_group_cpus(group)) next;
    	} while (group != sd->groups);//至此,已经将该调度域中所有调度组全部遍历完,如果有符合条件的最忙调度组的话,busiest已经指向它
    
    	if (!busiest || this_load >= max_load || busiest_nr_running == 0)//没有符合条件的,或者本地调度组比找到的那一组还要忙,或者最忙的组中已经没有进程,则不需要平衡
    		goto out_balanced;
    
    	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
    
    	if (this_load >= avg_load ||
    			100*max_load imbalance_pct*this_load)//另一个不需要平衡的条件,当前组负载大于平均平均负载,或者最大负载与当前组负载之比小于某个值
    		goto out_balanced;
    
    	busiest_load_per_task /= busiest_nr_running;//最忙组中每个进程的平均负载
    	if (group_imb)//如果组内cpu上的最大负载与最小负载之差大于组内平均负载的2倍,则进行一下修正
    		busiest_load_per_task = min(busiest_load_per_task, avg_load);
    
    	/*
    	 * We're trying to get all the cpus to the average_load, so we don't
    	 * want to push ourselves above the average load, nor do we wish to
    	 * reduce the max loaded cpu below the average load, as either of these
    	 * actions would just result in more rebalancing later, and ping-pong
    	 * tasks around. Thus we look for the minimum possible imbalance.
    	 * Negative imbalances (*we* are more loaded than anyone else) will
    	 * be counted as no imbalance for these purposes -- we can't fix that
    	 * by pulling tasks to us. Be careful of negative numbers as they'll
    	 * appear as very large values with unsigned longs.
    	 */
    	if (max_load <= busiest_load_per_task)
    		goto out_balanced;
    
    	/*
    	 * In the presence of smp nice balancing, certain scenarios can have
    	 * max load less than avg load(as we skip the groups at or below
    	 * its cpu_power, while calculating max_load..)
    	 */
    	if (max_load __cpu_power,
    				(avg_load - this_load) * this->__cpu_power)
    			/ SCHED_LOAD_SCALE;//计算一下需要移动的负载量,下面就是一些太细节的东西了,从逻辑上也讲不好是什么道理,所以不进行分析
    
    	/*
    	 * if *imbalance is less than the average load per runnable task
    	 * there is no gaurantee that any tasks will be moved so we'll have
    	 * a think about bumping its value to force at least one task to be
    	 * moved
    	 */
    	if (*imbalance  this_load_per_task)
    				imbn = 1;
    		} else
    			this_load_per_task = cpu_avg_load_per_task(this_cpu);
    
    		if (max_load - this_load + busiest_load_per_task >=
    					busiest_load_per_task * imbn) {
    			*imbalance = busiest_load_per_task;
    			return busiest;
    		}
    
    		/*
    		 * OK, we don't have enough imbalance to justify moving tasks,
    		 * however we may be able to increase total CPU power used by
    		 * moving them.
    		 */
    
    		pwr_now += busiest->__cpu_power *
    				min(busiest_load_per_task, max_load);
    		pwr_now += this->__cpu_power *
    				min(this_load_per_task, this_load);
    		pwr_now /= SCHED_LOAD_SCALE;
    
    		/* Amount of load we'd subtract */
    		tmp = sg_div_cpu_power(busiest,
    				busiest_load_per_task * SCHED_LOAD_SCALE);
    		if (max_load > tmp)
    			pwr_move += busiest->__cpu_power *
    				min(busiest_load_per_task, max_load - tmp);
    
    		/* Amount of load we'd add */
    		if (max_load * busiest->__cpu_power __cpu_power);
    		else
    			tmp = sg_div_cpu_power(this,
    				busiest_load_per_task * SCHED_LOAD_SCALE);
    		pwr_move += this->__cpu_power *
    				min(this_load_per_task, this_load + tmp);
    		pwr_move /= SCHED_LOAD_SCALE;
    
    		/* Move if we gain throughput */
    		if (pwr_move > pwr_now)
    			*imbalance = busiest_load_per_task;
    	}
    
    	return busiest;
    
    out_balanced:
    #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
    		goto ret;
    
    	if (this == group_leader && group_leader != group_min) {
    		*imbalance = min_load_per_task;
    		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
    			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
    				cpumask_first(sched_group_cpus(group_leader));
    		}
    		return group_min;
    	}
    #endif
    ret:
    	*imbalance = 0;
    	return NULL;
    }
    
    顺着load_balance的调用路线,接下来就要执行find_busiest_queue了,这个函数比较好理解
    /*
     * find_busiest_queue - find the busiest runqueue among the cpus in group.
     */
    static struct rq *
    find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
    		   unsigned long imbalance, const struct cpumask *cpus)
    {
    	struct rq *heaviest_rq = NULL;
    	unsigned long heaviest_load = 0;
    	int cpu;
    
    	for_each_cpu(cpu, sched_group_cpus(group)) {
    		struct rq *candidate;
    		unsigned long load;
    
    		/* Only cpus still present in the caller's mask are eligible. */
    		if (!cpumask_test_cpu(cpu, cpus))
    			continue;
    
    		candidate = cpu_rq(cpu);
    		load = weighted_cpuload(cpu);
    
    		/* Skip a cpu whose single task outweighs the imbalance. */
    		if (candidate->nr_running == 1 && load > imbalance)
    			continue;
    
    		/* Strictly heavier than the best so far: remember it. */
    		if (load > heaviest_load) {
    			heaviest_load = load;
    			heaviest_rq = candidate;
    		}
    	}
    
    	/* NULL when no eligible cpu carried a nonzero load. */
    	return heaviest_rq;
    }
    
    再次回到load_balance的调用路线中,这次终于可以实施最终的移动了
    busiest = find_busiest_queue(group, idle, imbalance, cpus);//找到该组中最忙的运行队列,也就是最忙的cpu
    	if (!busiest) {//如果所有cpu都不符合标准,也不需要平衡操作
    		schedstat_inc(sd, lb_nobusyq[idle]);
    		goto out_balanced;
    	}
    
    	BUG_ON(busiest == this_rq);
    
    	schedstat_add(sd, lb_imbalance[idle], imbalance);//更新统计信息
    
    	ld_moved = 0;//是否移动了某些进程的标志
    	if (busiest->nr_running > 1) {//该cpu上可运行进程数大于1时才考虑移动进程,注释中写了,如果进程数小于1,那么将其移走后,进程数达到零,不还是不平衡么?所以干脆不移动
    		/*
    		 * Attempt to move tasks. If find_busiest_group has found
    		 * an imbalance but busiest->nr_running <= 1, the group is
    		 * still unbalanced. ld_moved simply stays zero, so it is
    		 * correctly treated as an imbalance.
    		 */
    		local_irq_save(flags);
    		double_rq_lock(this_rq, busiest);//同时为两个队列加锁,要考虑防死锁,这里的处理是按指针地址大小进行加锁
    		ld_moved = move_tasks(this_rq, this_cpu, busiest,//根据imbalance参数的指示,在busiest队列中挑选可进程,移动到this_rq中去。
    				      imbalance, sd, idle, &all_pinned);
    		double_rq_unlock(this_rq, busiest);
    		local_irq_restore(flags);
    
    		/*
    		 * some other cpu did the load balance for us.
    		 */
    		if (ld_moved && this_cpu != smp_processor_id())
    			resched_cpu(this_cpu);
    
    		/* All tasks on this runqueue were pinned by CPU affinity */
    		if (unlikely(all_pinned)) {
    			cpumask_clear_cpu(cpu_of(busiest), cpus);
    			if (!cpumask_empty(cpus))
    				goto redo;
    			goto out_balanced;
    		}
    	}
    
    加锁之后,便进入到了move_tasks中,
    /*
     * move_tasks tries to move up to max_load_move weighted load from busiest to
     * this_rq, as part of a balancing operation within domain "sd".
     * Returns 1 if successful and 0 otherwise.
     *
     * Called with both runqueues locked.
     */
    static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
    		      unsigned long max_load_move,
    		      struct sched_domain *sd, enum cpu_idle_type idle,
    		      int *all_pinned)
    {
    	const struct sched_class *cls = sched_class_highest;
    	unsigned long moved = 0;
    	int this_best_prio = this_rq->curr->prio;
    
    	/* Ask each scheduling class in turn, starting with the highest. */
    	for (;;) {
    		unsigned long remaining = max_load_move - moved;
    
    		moved += cls->load_balance(this_rq, this_cpu, busiest,
    					   remaining, sd, idle, all_pinned,
    					   &this_best_prio);
    		cls = cls->next;
    
    		/* A newly idle cpu stops as soon as it has something to run. */
    		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
    			break;
    
    		/* Out of classes, or the requested load has been moved. */
    		if (!cls || max_load_move <= moved)
    			break;
    	}
    
    	return moved > 0;
    }
    
    函数主要由一个do-while循环完成,开始时class指向sched_class_highest,而在sched.c里面有:
     #define sched_class_highest (&rt_sched_class)
    
    也就是说,在循环第一次执行时,会调用rt_sched_class调度类里对应的load_balance函数,去sched_rt.c里面寻找,发现如下:
    	.load_balance		= load_balance_rt,
    
    /*
     * load_balance hook installed for the RT scheduling class. Every argument
     * is ignored and the function always reports that no load was moved, so
     * this balancing path never migrates RT tasks.
     */
    static unsigned long
    load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
    		unsigned long max_load_move,
    		struct sched_domain *sd, enum cpu_idle_type idle,
    		int *all_pinned, int *this_best_prio)
    {
    	/* don't touch RT tasks */
    	return 0;
    }
    
    可见,这是一个空函数,也就是对于负载平衡,是不会将rt类进程迁移走的,循环只好进入下一个调度类,也就是cfs调度类。去执行它所对应的load_balance函数
    
    #ifdef CONFIG_FAIR_GROUP_SCHED
    /*
     * load_balance hook of the fair class when group scheduling is enabled.
     *
     * After refreshing the per-group h_load values for the busiest CPU, every
     * task group is visited under RCU. The outstanding budget is scaled into
     * the group's own weight units (weight / (h_load + 1)) before tasks are
     * pulled from that group's cfs_rq, and the amount actually moved is scaled
     * back (h_load / (weight + 1)) before being charged against the budget.
     *
     * Returns the amount of load moved, i.e. max_load_move minus what is left.
     */
    static unsigned long
    load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
    		  unsigned long max_load_move,
    		  struct sched_domain *sd, enum cpu_idle_type idle,
    		  int *all_pinned, int *this_best_prio)
    {
    	long remaining = max_load_move;	/* budget still to be moved */
    	int src_cpu = cpu_of(busiest);	/* CPU that owns the busiest runqueue */
    	struct task_group *tg;

    	rcu_read_lock();
    	update_h_load(src_cpu);

    	list_for_each_entry_rcu(tg, &task_groups, list) {
    		struct cfs_rq *src_cfs_rq = tg->cfs_rq[src_cpu];
    		unsigned long h_load = src_cfs_rq->h_load;
    		unsigned long weight = src_cfs_rq->load.weight;
    		u64 quota, moved;

    		/* Nothing to pull from an empty group. */
    		if (!src_cfs_rq->task_weight)
    			continue;

    		/* Budget expressed in this group's weight units. */
    		quota = div_u64((u64)remaining * weight, h_load + 1);

    		moved = __load_balance_fair(this_rq, this_cpu, busiest,
    					    quota, sd, idle, all_pinned,
    					    this_best_prio,
    					    tg->cfs_rq[src_cpu]);
    		if (!moved)
    			continue;

    		/* Convert what was moved back before charging the budget. */
    		moved = div_u64(moved * h_load, weight + 1);

    		remaining -= moved;
    		if (remaining < 0)
    			break;
    	}
    	rcu_read_unlock();

    	return max_load_move - remaining;
    }
    
    
    update_h_load与之前看到过的更新shares值的函数比较像
     /*
      * Recompute the h_load values for @cpu by walking the task-group tree with
      * tg_load_down and tg_nop as the two callbacks. The cpu number is passed to
      * the callbacks through the void * data argument.
      * NOTE(review): presumably walk_tg_tree() visits every task group and runs
      * tg_load_down on the way down - confirm against its definition.
      */
     static void update_h_load(long cpu)
     {
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
     }
    
    前面已经说过tg_nop函数是一个空函数,来看一下tg_load_down函数
    /*
     * Compute the cpu's hierarchical load factor for each task group.
     * This needs to be done in a top-down fashion because the load of a child
     * group is a fraction of its parents load.
     */
    static int tg_load_down(struct task_group *tg, void *data)//注释里写的还算清楚
    {
    	unsigned long load;
    	long cpu = (long)data;
    
    	if (!tg->parent) {
    		load = cpu_rq(cpu)->load.weight;
    	} else {
    		load = tg->parent->cfs_rq[cpu]->h_load;//父层需要移动的负载量
    		load *= tg->cfs_rq[cpu]->shares;//这个值其实就是本层的load_weight值
    		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
    	}
    
    	tg->cfs_rq[cpu]->h_load = load;
    
    	return 0;
    }
    
    
    上面的代码算下来,就是本层调度组的h_load=父调度组的h_load*本调度组的shares值/(父调度组的load_weight+1);对于没有父组的根调度组,h_load直接取该cpu运行队列的load.weight。其实说白了就是按负载比例进行分配。 将本队列中各组的h_load计算出来以后,就可以去各组中去挑选实际的进程了。回到load_balance_fair函数中
    	update_h_load(busiest_cpu);//更新一下
    
    	list_for_each_entry_rcu(tg, &task_groups, list) {//对于各调度组在该cpu上的运行队列
    		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
    		unsigned long busiest_h_load = busiest_cfs_rq->h_load;//刚才update_h_load计算好的该组需要移动的负载量
    		unsigned long busiest_weight = busiest_cfs_rq->load.weight;//该组的负载
    		u64 rem_load, moved_load;
    
    		/*
    		 * empty group
    		 */
    		if (!busiest_cfs_rq->task_weight)
    			continue;
    
    		rem_load = (u64)rem_load_move * busiest_weight;
    		rem_load = div_u64(rem_load, busiest_h_load + 1);//rem_load=rem_load_move*(busiest_weight)/(busiest_h_load+1)
    
    		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
    				rem_load, sd, idle, all_pinned, this_best_prio,
    				tg->cfs_rq[busiest_cpu]);//最终的动作就是这里了
    
    		if (!moved_load)
    			continue;
    
    		moved_load *= busiest_h_load;
    		moved_load = div_u64(moved_load, busiest_weight + 1);
    
    		rem_load_move -= moved_load;//移动完一个组,将“成果”反馈,看看还是不是需要继续移动下一个组中的进程
    		if (rem_load_move < 0)
    			break;
    	}
    	rcu_read_unlock();
    
    	return max_load_move - rem_load_move;
    
    对于__load_balance_fair,如下:
    static unsigned long
    __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
    		unsigned long max_load_move, struct sched_domain *sd,
    		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
    		struct cfs_rq *cfs_rq)
    {
    	struct rq_iterator cfs_rq_iterator;
    
    	cfs_rq_iterator.start = load_balance_start_fair;
    	cfs_rq_iterator.next = load_balance_next_fair;
    	cfs_rq_iterator.arg = cfs_rq;
    
    	return balance_tasks(this_rq, this_cpu, busiest,
    			max_load_move, sd, idle, all_pinned,
    			this_best_prio, &cfs_rq_iterator);
    }
    
    还需要进入到balance_tasks中去
    /*
     * balance_tasks - pull tasks handed out by @iterator from @busiest to
     * @this_rq until roughly @max_load_move weighted load has been moved.
     *
     * A candidate is skipped when half of its weight already exceeds the
     * remaining budget, or when can_migrate_task() refuses it. The scan stops
     * when the iterator is exhausted, when more than sysctl_sched_nr_migrate
     * candidates have been looked at, or when the budget is used up.
     *
     * If @all_pinned is non-NULL it is set to 1 when a scan took place and
     * can_migrate_task() never cleared the flag, and to 0 otherwise (including
     * when max_load_move is 0 and nothing was scanned).
     *
     * Returns the weighted load actually moved.
     */
    static unsigned long
    balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
    	      unsigned long max_load_move, struct sched_domain *sd,
    	      enum cpu_idle_type idle, int *all_pinned,
    	      int *this_best_prio, struct rq_iterator *iterator)
    {
    	int loops = 0, pulled = 0, pinned = 0;
    	struct task_struct *p;
    	long rem_load_move = max_load_move;

    	if (max_load_move == 0)
    		goto out;

    	/* Assume everything is pinned until can_migrate_task() says otherwise. */
    	pinned = 1;

    	/*
    	 * Start the load-balancing iterator:
    	 */
    	p = iterator->start(iterator->arg);
    next:
    	if (!p || loops++ > sysctl_sched_nr_migrate)
    		goto out;

    	/*
    	 * Skip the task if its weight is more than twice the remaining
    	 * budget, or if it may not be migrated to this_cpu.
    	 */
    	if ((p->se.load.weight >> 1) > rem_load_move ||
    	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
    		p = iterator->next(iterator->arg);
    		goto next;
    	}

    	/* Movable: pull it onto this_rq and charge its weight to the budget. */
    	pull_task(busiest, p, this_rq, this_cpu);
    	pulled++;
    	rem_load_move -= p->se.load.weight;

    	/*
    	 * We only want to steal up to the prescribed amount of weighted load.
    	 */
    	if (rem_load_move > 0) {
    		/* Remember the numerically lowest prio pulled so far. */
    		if (p->prio < *this_best_prio)
    			*this_best_prio = p->prio;
    		p = iterator->next(iterator->arg);
    		goto next;
    	}
    out:
    	/*
    	 * Right now, this is one of only two places pull_task() is called,
    	 * so we can safely collect pull_task() stats here rather than
    	 * inside pull_task().
    	 */
    	schedstat_add(sd, lb_gained[idle], pulled);

    	if (all_pinned)
    		*all_pinned = pinned;

    	return max_load_move - rem_load_move;
    }
    
    
    can_migrate_task的代码如下:
    /*
     * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
     *
     * Returns 1 when p may be moved and 0 otherwise. *all_pinned is cleared as
     * soon as p passes the cpus_allowed check, i.e. once at least one examined
     * task is allowed to run on this_cpu; this function never sets it.
     */
    static
    int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
    		     struct sched_domain *sd, enum cpu_idle_type idle,
    		     int *all_pinned)
    {
    	/*
    	 * We do not migrate tasks that are:
    	 * 1) running (obviously), or
    	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
    	 * 3) are cache-hot on their current CPU.
    	 */
    	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
    		schedstat_inc(p, se.nr_failed_migrations_affine);
    		return 0;
    	}
    	/* p is allowed on this_cpu, so not every task is pinned. */
    	*all_pinned = 0;

    	if (task_running(rq, p)) {
    		schedstat_inc(p, se.nr_failed_migrations_running);
    		return 0;
    	}

    	/*
    	 * Aggressive migration if:
    	 * 1) task is cache cold, or
    	 * 2) too many balance attempts have failed.
    	 */

    	if (!task_hot(p, rq->clock, sd) ||
    			sd->nr_balance_failed > sd->cache_nice_tries) {
    #ifdef CONFIG_SCHEDSTATS
    		/* Still hot here means the failure count forced the move. */
    		if (task_hot(p, rq->clock, sd)) {
    			schedstat_inc(sd, lb_hot_gained[idle]);
    			schedstat_inc(p, se.nr_forced_migrations);
    		}
    #endif
    		return 1;
    	}

    	/* Cache-hot and not yet enough failed attempts: leave it alone. */
    	if (task_hot(p, rq->clock, sd)) {
    		schedstat_inc(p, se.nr_failed_migrations_hot);
    		return 0;
    	}
    	return 1;
    }
    
    注释中写的极为详细,这里不作过多解释。 那么往下,就来看一看pull_task吧
    /*
     * pull_task - move a task from a remote runqueue to the local runqueue.
     * Both runqueues must be locked.
     *
     * The task is dequeued from src_rq, retargeted at this_cpu and enqueued on
     * this_rq, after which check_preempt_curr() is called for this_rq with the
     * newly arrived task.
     */
    static void pull_task(struct rq *src_rq, struct task_struct *p,
    		      struct rq *this_rq, int this_cpu)
    {
    	deactivate_task(src_rq, p, 0);	/* take p off the source runqueue */
    	set_task_cpu(p, this_cpu);	/* retarget p at this_cpu; it is not queued yet */
    	activate_task(this_rq, p, 0);	/* final step: queue p on this_rq */
    	/*
    	 * Note that idle threads have a prio of MAX_PRIO, for this test
    	 * to be always true for them.
    	 */
    	check_preempt_curr(this_rq, p, 0);
    }
    
    set_task_cpu函数:
    
    /*
     * set_task_cpu - rebase p's per-CPU scheduling state and point it at new_cpu.
     *
     * The difference between the old and the new runqueue clock is subtracted
     * from p's non-zero schedstat timestamps (CONFIG_SCHEDSTATS only), and
     * p->se.vruntime is shifted by the difference between the old and the new
     * cfs_rq min_vruntime, before __set_task_cpu() records the new CPU.
     */
    void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
    {
    	int old_cpu = task_cpu(p);
    	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
    	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
    		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
    	u64 clock_offset;

    	/* How far the old runqueue clock is ahead of the new one. */
    	clock_offset = old_rq->clock - new_rq->clock;

    	trace_sched_migrate_task(p, task_cpu(p), new_cpu);

    #ifdef CONFIG_SCHEDSTATS
    	/* Re-express the recorded timestamps against the new rq clock. */
    	if (p->se.wait_start)
    		p->se.wait_start -= clock_offset;
    	if (p->se.sleep_start)
    		p->se.sleep_start -= clock_offset;
    	if (p->se.block_start)
    		p->se.block_start -= clock_offset;
    	if (old_cpu != new_cpu) {
    		schedstat_inc(p, se.nr_migrations);
    		if (task_hot(p, old_rq->clock, NULL))
    			schedstat_inc(p, se.nr_forced2_migrations);
    	}
    #endif
    	/* Make vruntime relative to the destination queue's min_vruntime. */
    	p->se.vruntime -= old_cfsrq->min_vruntime -
    					 new_cfsrq->min_vruntime;

    	__set_task_cpu(p, new_cpu);
    }
    
    __set_task_cpu:
    /*
     * Record @cpu as p's CPU: first update p's group runqueue pointers via
     * set_task_rq(), then (on SMP) publish the new cpu number in thread_info
     * behind a write barrier.
     */
    static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
    {
    	set_task_rq(p, cpu);
    #ifdef CONFIG_SMP
    	/*
    	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
    	 * successfully executed on another CPU. We must ensure that updates of
    	 * per-task data have been completed by this moment.
    	 */
    	smp_wmb();
    	task_thread_info(p)->cpu = cpu;
    #endif
    }
    
    set_task_rq:
    /*
     * Change a task's cfs_rq and parent entity if it moves across CPUs/groups.
     *
     * The task keeps its task group; only the per-cpu runqueue and parent
     * entity pointers are switched to that group's entries for @cpu. With
     * neither group-scheduling option configured this is a no-op.
     */
    static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
    {
    #ifdef CONFIG_FAIR_GROUP_SCHED
    	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
    	p->se.parent = task_group(p)->se[cpu];
    #endif

    #ifdef CONFIG_RT_GROUP_SCHED
    	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
    	p->rt.parent = task_group(p)->rt_se[cpu];
    #endif
    }
    
    
    可见,p进程原来属于哪个组,移动后还是属于哪个组,只不过它被移动到了该组在其它cpu上的运行队列中 由move_tasks产生的动作到这里就完了,其实就是按照先算出来的每个组需要移动的负载量,依次从每个组中挑选进程移走。 再次回到load_balance函数中,现在的情况是,通过寻找该调度域中最忙的调度组,以及找到最忙调度组中的最忙cpu,又通过move_tasks将各种进程组中在此队列上的进程进行了适当的迁移,迁移到了this_cpu上,那么,可以最后检查一下工作了,看下刚才上述那些工作完成的怎么样
    if (!ld_moved) {//如果没有移动进程
    		schedstat_inc(sd, lb_failed[idle]);
    		sd->nr_balance_failed++;
    
    		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {//如果失败次数已经超过cache_nice_tries+2(这个值看名字应该是保证cache hot用的)
    
    			spin_lock_irqsave(&busiest->lock, flags);
    
    			/* don't kick the migration_thread, if the curr
    			 * task on busiest cpu can't be moved to this_cpu
    			 */
    			if (!cpumask_test_cpu(this_cpu,
    					      &busiest->curr->cpus_allowed)) {//找下原因,是不是因为进程被设定了不允许移动到this_cpu上
    				spin_unlock_irqrestore(&busiest->lock, flags);
    				all_pinned = 1;
    				goto out_one_pinned;
    			}
    
    			if (!busiest->active_balance) {
    				busiest->active_balance = 1;
    				busiest->push_cpu = this_cpu;
    				active_balance = 1;
    			}
    			spin_unlock_irqrestore(&busiest->lock, flags);
    			if (active_balance)//实在不行,唤醒migration_thread进程,同步的去移动进程
    				wake_up_process(busiest->migration_thread);
    
    			/*
    			 * We've kicked active balancing, reset the failure
    			 * counter.
    			 */
    			sd->nr_balance_failed = sd->cache_nice_tries+1;
    		}
    	} else
    		sd->nr_balance_failed = 0;
    
    	if (likely(!active_balance)) {
    		/* We were unbalanced, so reset the balancing interval */
    		sd->balance_interval = sd->min_interval;//调整一下平衡周期
    	} else {
    		/*
    		 * If we've begun active balancing, start to back off. This
    		 * case may not be covered by the all_pinned logic if there
    		 * is only 1 task on the busy runqueue (because we don't call
    		 * move_tasks).
    		 */
    		if (sd->balance_interval < sd->max_interval)
    			sd->balance_interval *= 2;
    	}
    
    migration_thread是干什么的呢?原来,每个cpu都会绑定一个migration_thread内核线程,专门应对这种情况,至于绑定的方法,那就是将这个线程的task_struct结构体中cpu掩码设置好就OK了,这也说明了为什么前面代码中会有“不允许移动到this_cpu”的情况。 那么migration_thread都干些什么?在sched.c中有如下函数,在fork migration_thread时,该线程将会执行它:
    /*
     * migration_thread - this is a highprio system thread that performs
     * thread migration by bumping thread off CPU then 'pushing' onto
     * another runqueue.
     *
     * @data carries the cpu number this thread serves. Each pass of the main
     * loop, under rq->lock, runs a pending active balance request and then
     * handles at most one queued migration_req; with an empty queue the thread
     * sleeps in schedule(). Returns 0 once kthread_should_stop() is true.
     */
    static int migration_thread(void *data)
    {
    	int cpu = (long)data;
    	struct rq *rq;

    	rq = cpu_rq(cpu);
    	BUG_ON(rq->migration_thread != current);

    	set_current_state(TASK_INTERRUPTIBLE);
    	while (!kthread_should_stop()) {
    		struct migration_req *req;
    		struct list_head *head;

    		spin_lock_irq(&rq->lock);

    		/* CPU went offline: stop working and just wait to be stopped. */
    		if (cpu_is_offline(cpu)) {
    			spin_unlock_irq(&rq->lock);
    			goto wait_to_die;
    		}

    		/* load_balance() asked this runqueue to push a task away. */
    		if (rq->active_balance) {
    			active_load_balance(rq, cpu);
    			rq->active_balance = 0;
    		}

    		head = &rq->migration_queue;

    		/* No explicit requests queued: sleep until woken again. */
    		if (list_empty(head)) {
    			spin_unlock_irq(&rq->lock);
    			schedule();
    			set_current_state(TASK_INTERRUPTIBLE);
    			continue;
    		}
    		req = list_entry(head->next, struct migration_req, list);
    		list_del_init(head->next);

    		/*
    		 * Only the lock is dropped here; interrupts stay disabled
    		 * across __migrate_task() and are re-enabled afterwards.
    		 */
    		spin_unlock(&rq->lock);
    		__migrate_task(req->task, cpu, req->dest_cpu);
    		local_irq_enable();

    		/* Signal completion of this request via req->done. */
    		complete(&req->done);
    	}
    	__set_current_state(TASK_RUNNING);
    	return 0;

    wait_to_die:
    	/* Wait for kthread_stop */
    	set_current_state(TASK_INTERRUPTIBLE);
    	while (!kthread_should_stop()) {
    		schedule();
    		set_current_state(TASK_INTERRUPTIBLE);
    	}
    	__set_current_state(TASK_RUNNING);
    	return 0;
    }
    
    按刚才的情景,会执行到active_load_balance函数
    /*
     * active_load_balance is run by migration threads. It pushes running tasks
     * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
     * running on each physical CPU where possible, and avoids physical /
     * logical imbalances.
     *
     * Called with busiest_rq locked.
     */
    static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
    {
    	int target_cpu = busiest_rq->push_cpu;
    	struct sched_domain *sd;
    	struct rq *target_rq;
    
    	/* Is there any task to move? */
    	if (busiest_rq->nr_running flags & SD_LOAD_BALANCE) &&
    		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
    				break;
    	}
    
    	if (likely(sd)) {
    		schedstat_inc(sd, alb_count);
    
    		if (move_one_task(target_rq, target_cpu, busiest_rq,
    				  sd, CPU_IDLE))//这里是move_one_task,也就是说只移动一个进程,减小了力度,毕竟是受阻才会执行到这里的
    			schedstat_inc(sd, alb_pushed);
    		else
    			schedstat_inc(sd, alb_failed);
    	}
    	double_unlock_balance(busiest_rq, target_rq);
    }
    
    
    在该进程被唤醒之前,push_cpu就已经被设置为load_balance里的this_cpu,也就是说,当时移动不了,那过后再移动,但是,目标cpu还是不变的。此外,migration_thread线程还会检查rq中是否有提交上来的需要转移的进程,如果有,一并将其转移,那么进程究竟是怎么跑到这个队列中来的呢?用cscope一路查下去,发现是在exec中,也就是sys_execve系统调用的执行过程中。
  • 相关阅读:
    不会写研发部门OKR?来这里看看吧
    HR 必须了解的绩效考核
    成功的OKR复盘会需要注意什么
    effective解读-第一条 静态工厂创建对象代替构造器
    灾难恢复:邮箱数据库操作总结:整理 查询邮箱数据库大小和空白数据大小(重要文档)
    Exchange传输队列queue数据库mail.que文件越来越大(重要文档)
    Exchange2016邮件流(重要文档)
    Exchange2016服务器使用到的9个端口
    VMware EXSI 启用vMotion
    Exchange2016DAG的配置
  • 原文地址:https://www.cnblogs.com/yangce/p/2910096.html
Copyright © 2011-2022 走看看