从现在开始来分析和负载平衡有关的策略。
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
* It also gets called by the fork code, when changing the parent's
* timeslices.
*
* Per-tick work for the current cpu: refresh the scheduler clock, then,
* while holding rq->lock, update the runqueue clock and cpu load and call
* the task_tick hook of the current task's scheduling class. On SMP builds
* it then records the result of idle_cpu() in rq->idle_at_tick and calls
* trigger_load_balance().
*/
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
sched_clock_tick();
/* rq->lock covers the clock/load update and the class tick hook only */
spin_lock(&rq->lock);
update_rq_clock(rq);
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);// remember whether this cpu is idle at this tick (presumably: only the idle task is runnable)
trigger_load_balance(rq, cpu);
#endif
}
可见,在每次处理时钟中断时,在最后会检查一下是否需要进行一次负载平衡。
进入到trigger_load_balance中去,从名字就可以猜出个大概。
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*
* In case of CONFIG_NO_HZ, this is the place where we nominate a new
* idle load balancing owner or decide to stop the periodic load balancing,
* if the whole system is idle.
*
* @rq:  runqueue of the cpu taking the tick
* @cpu: id of that cpu
*/
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
/*
* If we were in the nohz mode recently and busy at the current
* scheduler tick, then check if we need to nominate new idle
* load balancer.
*/
if (rq->in_nohz_recently && !rq->idle_at_tick) {
rq->in_nohz_recently = 0;
/* This cpu is busy now: if it was the idle load balancer, resign. */
if (atomic_read(&nohz.load_balancer) == cpu) {
cpumask_clear_cpu(cpu, nohz.cpu_mask);
atomic_set(&nohz.load_balancer, -1);
}
/* -1 means nobody currently owns idle load balancing. */
if (atomic_read(&nohz.load_balancer) == -1) {
/*
* simple selection for now: Nominate the
* first cpu in the nohz list to be the next
* ilb owner.
*
* TBD: Traverse the sched domains and nominate
* the nearest cpu in the nohz.cpu_mask.
*/
int ilb = cpumask_first(nohz.cpu_mask);
/* ilb >= nr_cpu_ids means the nohz mask is empty */
if (ilb < nr_cpu_ids)
resched_cpu(ilb);
}
}
/*
* If this cpu is idle and doing idle load balancing for all the
* cpus with ticks stopped, is it time for that to stop?
*/
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
resched_cpu(cpu);
return;
}
/*
* If this cpu is idle and the idle load balancing is done by
* someone else, then no need raise the SCHED_SOFTIRQ
*/
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
cpumask_test_cpu(cpu, nohz.cpu_mask))
return;
#endif
/* Periodic balancing is due once jiffies has reached rq->next_balance. */
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
}
忽略掉CONFIG_NO_HZ的部分,可以看到,这个函数就是判断一下当前的jiffies是不是已经大于等于rq->next_balance的值,如果是的话,会进一步调用raise_softirq提交一个软中断。提交的过程很简单,就是把SCHED_SOFTIRQ对应的位置位,处理软中断时检查该位是否置位,如果置位则调用相应的软中断处理函数。
用cscope在源码中搜索,发现有如下语句:
/* Register run_rebalance_domains as the SCHED_SOFTIRQ handler; compiled only when CONFIG_SMP is set. */
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif
呵,这个软中断还是特别为SMP架构准备的呢~
顺着这个线索,去查看run_rebalance_domains的实现
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* In CONFIG_NO_HZ case, the idle load balance owner will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*
* @h: softirq action descriptor; not used by this handler.
*/
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
struct rq *this_rq = cpu_rq(this_cpu);
/* Derive the idle type from the state recorded at the last tick. */
enum cpu_idle_type idle = this_rq->idle_at_tick ?
CPU_IDLE : CPU_NOT_IDLE;
rebalance_domains(this_cpu, idle);
#ifdef CONFIG_NO_HZ
/*
* If this cpu is the owner for idle load balancing, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
if (this_rq->idle_at_tick &&
atomic_read(&nohz.load_balancer) == this_cpu) {
struct rq *rq;
int balance_cpu;
for_each_cpu(balance_cpu, nohz.cpu_mask) {
/* this cpu was already balanced above */
if (balance_cpu == this_cpu)
continue;
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
if (need_resched())
break;
rebalance_domains(balance_cpu, CPU_IDLE);
rq = cpu_rq(balance_cpu);
/* keep this_rq->next_balance at the earliest deadline seen */
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
}
#endif
}
忽略CONFIG_NO_HZ,那么这个函数就是根据当前cpu的负载状态(为idle进程还是其它)确定idle参数,然后调用rebalance_domains
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
*
* @cpu:  cpu whose domains are examined
* @idle: idle state of that cpu; scales the interval and is passed on to
*        load_balance()
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
int balance = 1;
struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
cpumask_var_t tmp;
/* Fails alloc? Rebalancing probably not a priority right now. */
if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
return;
for_each_domain(cpu, sd) {// iterate over this cpu's sched domains (presumably its own domain first, then the parents)
if (!(sd->flags & SD_LOAD_BALANCE))// this domain has opted out of load balancing, skip it
continue;
interval = sd->balance_interval;// balancing period of this domain
if (idle != CPU_IDLE)
interval *= sd->busy_factor;// stretch the period when the cpu is not idle
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);// convert milliseconds to jiffies
if (unlikely(!interval))
interval = 1;
if (interval > HZ*NR_CPUS/10)// clamp the interval to an upper bound
interval = HZ*NR_CPUS/10;
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
/* someone else holds the balancing lock: skip balancing this domain */
if (!spin_trylock(&balancing))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {// this domain is due for balancing
if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
* not idle.
*/
idle = CPU_NOT_IDLE;// treat the cpu as busy for the remaining domains
}
sd->last_balance = jiffies;// record when this domain was last balanced
}
if (need_serialize)
spin_unlock(&balancing);
out:
if (time_after(next_balance, sd->last_balance + interval)) {// track the earliest next balancing time across domains
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
* actively.
*/
if (!balance)
break;
}
/*
* next_balance will be updated only when there is a need.
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
free_cpumask_var(tmp);
}
重点就是load_balance了
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance, struct cpumask *cpus)
{
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
cpumask_setall(cpus);//先将所有cpu置位
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
* let the state of idle sibling percolate up as CPU_IDLE, instead of
* portraying it as CPU_NOT_IDLE.
*/
if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))//SMP结构中,SHARE_CPUPOWER不会出现
sd_idle = 1;
schedstat_inc(sd, lb_count[idle]);//更新统计信息
redo:
update_shares(sd);//将此调度域中的每个参加调度的进程组的share值进行更新
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
cpus, balance);//找也该调度域中最忙的调度组
if (*balance == 0)
goto out_balanced;
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(group, idle, imbalance, cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], imbalance);
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running nr_balance_failed++;
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the migration_thread, if the curr
* task on busiest cpu can't be moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
&busiest->curr->cpus_allowed)) {
spin_unlock_irqrestore(&busiest->lock, flags);
all_pinned = 1;
goto out_one_pinned;
}
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance)
wake_up_process(busiest->migration_thread);
/*
* We've kicked active balancing, reset the failure
* counter.
*/
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
sd->nr_balance_failed = 0;
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* move_tasks).
*/
if (sd->balance_interval max_interval)
sd->balance_interval *= 2;
}
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
ld_moved = -1;
goto out;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
/* tune up the balancing interval */
if ((all_pinned && sd->balance_interval balance_interval max_interval))
sd->balance_interval *= 2;
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
ld_moved = -1;
else
ld_moved = 0;
out:
if (ld_moved)
update_shares(sd);
return ld_moved;
}
其中update_shares有必要去看一下
/*
 * Refresh the per-cpu shares of every task group over the cpus of @sd,
 * rate-limited by sysctl_sched_shares_ratelimit.
 *
 * @sd: sched domain whose groups are updated; sd->last_update holds the
 *      timestamp of the previous refresh.
 */
static void update_shares(struct sched_domain *sd)
{
	u64 now = cpu_clock(raw_smp_processor_id());
	s64 elapsed = now - sd->last_update;

	/* Second check: only walk the group tree once the rate limit has elapsed. */
	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
		sd->last_update = now;
		/* nothing on the way down, recompute shares on the way up */
		walk_tg_tree(tg_nop, tg_shares_up, sd);
	}
}
如果真的需要去更新该调度域的各个进程组的share值的话,将调用walk_tg_tree进行更新操作,tg_nop、tg_shares_up是两个函数指针,其中在这里,tg_nop进行空操作,tg_shares_up将进行真正的更新操作。
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*
* @down: visitor called when a group is first entered
* @up:   visitor called when a group is left for the last time
* @data: opaque argument passed to both visitors
*
* Returns 0 after a complete walk, or the first non-zero value returned by
* a visitor, which aborts the walk. The walk runs under rcu_read_lock().
*/
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
rcu_read_lock();
parent = &root_task_group;
down:
/* entering the node currently held in 'parent' */
ret = (*down)(parent, data);
if (ret)
goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
/* descend: the child becomes the current node */
parent = child;
goto down;
up:
/* re-entered from below with 'child' restored: move on to the next sibling */
continue;
}
/* all children handled: leaving the current node */
ret = (*up)(parent, data);
if (ret)
goto out_unlock;
/* climb one level and resume the parent's child iteration at 'up' */
child = parent;
parent = parent->parent;
if (parent)
goto up;
out_unlock:
rcu_read_unlock();
return ret;
}
代码比较难读,不如自己在纸上画个图,实际走一遍,就看清楚了,这里实际上就是从下而上,从左到右,依次更新每个调度组的share值,具体的更新方法在tg_shares_up函数中体现。
//注释中也说明了刚才的遍历方法
/*
* Re-compute the task group their per cpu shares over the given domain.
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*
* @tg:   task group being visited
* @data: the struct sched_domain the shares are computed over
*
* Always returns 0, so the tree walk is never aborted.
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long weight, rq_weight = 0;
unsigned long shares = 0;
struct sched_domain *sd = data;
int i;
for_each_cpu(i, sched_domain_span(sd)) {// for every cpu spanned by this domain
/*
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
* run here it will not get delayed by group starvation.
*/
weight = tg->cfs_rq[i]->load.weight;// load weight of this group's cfs_rq on cpu i
if (!weight)// no load on this cpu: pretend NICE_0_LOAD (see the comment above)
weight = NICE_0_LOAD;
tg->cfs_rq[i]->rq_weight = weight;// note: this is stored in the cfs_rq's rq_weight field
rq_weight += weight;// accumulate the domain-wide rq_weight
shares += tg->cfs_rq[i]->shares;// sum the group's per-cpu shares over the domain
}
// clamp the summed shares to the group's configured shares
if ((!shares && rq_weight) || shares > tg->shares)
shares = tg->shares;
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))// no load-balancing parent domain (the article's author notes this holds on plain SMP)
shares = tg->shares;// use the group's full shares value
for_each_cpu(i, sched_domain_span(sd))// second pass: apply the result to every cpu
update_group_shares_cpu(tg, i, shares, rq_weight);
return 0;
}
再来看update_group_shares_cpu
/*
* Calculate and set the cpu's group shares.
*
* Does nothing when the group has no sched entity on @cpu.
*/
static void// tg: group reached by the tree walk; cpu: a cpu of the domain; sd_shares: shares value the caller computed for the domain; sd_rq_weight: sum of the group's per-cpu rq_weight over the domain
update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long sd_shares, unsigned long sd_rq_weight)
{
unsigned long shares;
unsigned long rq_weight;
if (!tg->se[cpu])
return;
rq_weight = tg->cfs_rq[cpu]->rq_weight;
/*
* Sum shares * rq_weight
* shares = -----------------------
* Sum rq_weight
*
*/
shares = (sd_shares * rq_weight) / sd_rq_weight;// this cpu's proportional slice, as in the formula above
shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);// clamp into [MIN_SHARES, MAX_SHARES]
if (abs(shares - tg->se[cpu]->load.weight) >
sysctl_sched_shares_thresh) {// only update when the change exceeds the threshold, to avoid updating too often
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
tg->cfs_rq[cpu]->shares = shares;
__set_se_shares(tg->se[cpu], shares);// the shares value finally lands in the group's sched entity
spin_unlock_irqrestore(&rq->lock, flags);
}
}
注意上面的计算方法,是说对组内的cpu来讲,共同来分担该组的shares值,具体的分担方法是,按比例来,哪个cpu的负载占所有cpu负载的百分比大,哪个cpu分得的shares值也就大一些,优先级就大一些,运行的时候就会多一些筹码,
这里需要返回去看pick_next_task的部分代码
__set_se_shares代码如下
/*
* Set the load weight of @se to @shares.
*
* If the entity is currently queued it is dequeued first and enqueued
* again afterwards, so the weight changes while it is off its cfs_rq.
* inv_weight is reset to 0 along with the weight.
*/
static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
int on_rq;
/* sample once so the dequeue and the enqueue stay paired */
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
se->load.weight = shares;
se->load.inv_weight = 0;
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
}
很好理解,先移出可执行队列,更新其负载后,再移入可执行队列。
update_shares到这里就分析完了,注意update_shares的执行时间是在已经确定需要进行负载平衡,但是还没有开始确定怎么平衡之前。先更新一下该调度域中各个组中的负载情况,有助于下面的调度组以及进程的选择。
返回刚才的load_balance函数,继续往下进行。
redo:
update_shares(sd);// refresh the shares of every task group taking part in scheduling in this domain
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
cpus, balance);// find the busiest sched group in this domain
if (*balance == 0)
goto out_balanced;
if (!group) {// no group qualifies as busiest, so nothing to balance
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(group, idle, imbalance, cpus);// find the busiest runqueue (i.e. cpu) in that group
if (!busiest) {// no cpu qualifies either, so nothing to balance
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], imbalance);// statistics
ld_moved = 0;// records whether any task was moved
if (busiest->nr_running > 1) {// only try to move tasks when the busiest queue has more than one runnable task; see the comment below for the nr_running <= 1 case
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,// pick tasks from busiest, guided by imbalance, and move them to this_rq
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
/*
* some other cpu did the load balance for us.
*/
if (ld_moved && this_cpu != smp_processor_id())
resched_cpu(this_cpu);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
if (!cpumask_empty(cpus))
goto redo;
goto out_balanced;
}
}
先看 find_busiest_group函数,这个函数比较长
/*
* find_busiest_group finds and returns the busiest CPU group within the
* domain. It calculates and returns the amount of weighted load which
* should be moved to restore balance via the imbalance parameter.
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const struct cpumask *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
unsigned long this_load_per_task, this_nr_running;
int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int power_savings_balance = 1;
unsigned long leader_nr_running = 0, min_load_per_task = 0;
unsigned long min_nr_running = ULONG_MAX;
struct sched_group *group_min = NULL, *group_leader = NULL;
#endif
max_load = this_load = total_load = total_pwr = 0;
busiest_load_per_task = busiest_nr_running = 0;
this_load_per_task = this_nr_running = 0;
if (idle == CPU_NOT_IDLE)//先根据传进来的idle参数来确定load_idx的值,这个值在下面寻找最忙调度组(不是进程组)时会作为一个重要指标
load_idx = sd->busy_idx;//busy_idx默认为3
else if (idle == CPU_NEWLY_IDLE)
load_idx = sd->newidle_idx;//newidle_idx为2
else
load_idx = sd->idle_idx;idle_idx为1
do {//从这里一直到while(group!=sd->groups)是一个大循环,其目的就是遍布这个调度域中所有的调度组,找出最忙的那个,其中,this_cpu所属的调度组不参与与其它调度组的竞争
unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
int local_group;
int i;
int __group_imb = 0;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_nr_running, sum_weighted_load;
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));//如果发现tihs_cpu属于当前的调度组,那么将local_group置位
if (local_group)
balance_cpu = cpumask_first(sched_group_cpus(group));//如果正在处理"local_group",那么将balance_cpu暂定为该组中第一个cpu
/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
sum_avg_load_per_task = avg_load_per_task = 0;
max_cpu_load = 0;
min_cpu_load = ~0UL;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {//对于该组中每个cpu
struct rq *rq = cpu_rq(i);
if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
if (local_group) {//如果是本地组,且当前cpu为idle_cpu,并在循环中还没有进行过对balance_cpu的修正
if (idle_cpu(i) && !first_idle_cpu) {
first_idle_cpu = 1;
balance_cpu = i;//将balance_cpu置为i,仔细考虑下,这里的逻辑就是说,如果本地组中有空闲cpu,那么就将第一个空闲cpu作为balance_cpu,否则,将该组中第一个cpu作为balance_cpu
}
load = target_load(i, load_idx);//累加计算该组的负载,增加的数目要根据前面确定的load_idx来确定
} else {//如果当前组不是本地组
load = source_load(i, load_idx);//同上
if (load > max_cpu_load)//如果该调度组的总负载大于已经找到的最大负载,或者小于已经找到的最小负载,则更新最大/最小值
max_cpu_load = load;
if (min_cpu_load > load)
min_cpu_load = load;
}
avg_load += load;//根据load_idx计算出来的负载之和
sum_nr_running += rq->nr_running;//组内各个cpu上可运行队列中进程数目之和
sum_weighted_load += weighted_cpuload(i);//该组内当前所有cpu的负载之和,注意这里是当前的,和avg_load不同,因为avg_load的计算涉及到历史值,也就是和load_idx有关
sum_avg_load_per_task += cpu_avg_load_per_task(i);//该cpu上所有进程的平均负载
}
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
if (idle != CPU_NEWLY_IDLE && local_group &&
balance_cpu != this_cpu && balance) {
*balance = 0;
goto ret;
}
total_load += avg_load;//调度域的总负载
total_pwr += group->__cpu_power;//这个cpu_power还没弄清是怎么回事
/* Adjust by relative CPU power of the group */
avg_load = sg_div_cpu_power(group,
avg_load * SCHED_LOAD_SCALE);//根据该总的avg_load以及其power确定该组最终的avg_load
/*
* Consider the group unbalanced when the imbalance is larger
* than the average weight of two tasks.
*
* APZ: with cgroup the avg task weight can vary wildly and
* might not be a suitable number - should we keep a
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
avg_load_per_task = sg_div_cpu_power(group,
sum_avg_load_per_task * SCHED_LOAD_SCALE);//同样,修正该组的avg_load_per_task值
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)//如果该组内最大cpu负载值与最小cpu负载值之差大于平均负载值的2倍,则__group_imb(imbalance)置1,下面会看到它的作用
__group_imb = 1;
group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {//如果是本地组,只更新this相关的变量,并不更新busiest指针的指向
this_load = avg_load;
this = group;
this_nr_running = sum_nr_running;
this_load_per_task = sum_weighted_load;
} else if (avg_load > max_load &&
(sum_nr_running > group_capacity || __group_imb)) {//如果有不平衡的情况,或者组内的进程数目已经超过了该组的能力,且该组的平均负载大于已知的其它组的最大平均负载
max_load = avg_load;//更新最大值
busiest = group;//更新指针指向
busiest_nr_running = sum_nr_running;//更新最忙组中的进程数目
busiest_load_per_task = sum_weighted_load;
group_imb = __group_imb;
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)//SMT结构暂不分析,所以会跳过很长的代码,直到整个do-while循环结束
/*
* Busy processors will not participate in power savings
* balance.
*/
if (idle == CPU_NOT_IDLE ||
!(sd->flags & SD_POWERSAVINGS_BALANCE))
goto group_next;
/*
* If the local group is idle or completely loaded
* no need to do power savings balance at this domain
*/
if (local_group && (this_nr_running >= group_capacity ||
!this_nr_running))
power_savings_balance = 0;
/*
* If a group is already running at full capacity or idle,
* don't include that group in power savings calculations
*/
if (!power_savings_balance || sum_nr_running >= group_capacity
|| !sum_nr_running)
goto group_next;
/*
* Calculate the group which has the least non-idle load.
* This is the group from where we need to pick up the load
* for saving power
*/
if ((sum_nr_running
cpumask_first(sched_group_cpus(group_min)))) {
group_min = group;
min_nr_running = sum_nr_running;
min_load_per_task = sum_weighted_load /
sum_nr_running;
}
/*
* Calculate the group which is almost near its
* capacity but still has some space to pick up some load
* from other group and save more power
*/
if (sum_nr_running leader_nr_running ||
(sum_nr_running == leader_nr_running &&
cpumask_first(sched_group_cpus(group)) next;
} while (group != sd->groups);//至此,已经将该调度域中所有调度组全部遍历完,如果有符合条件的最忙调度组的话,busiest已经指向它
if (!busiest || this_load >= max_load || busiest_nr_running == 0)//没有符合条件的,或者本地调度组比找到的那一组还要忙,或者最忙的组中已经没有进程,则不需要平衡
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
if (this_load >= avg_load ||
100*max_load imbalance_pct*this_load)//另一个不需要平衡的条件,当前组负载大于平均平均负载,或者最大负载与当前组负载之比小于某个值
goto out_balanced;
busiest_load_per_task /= busiest_nr_running;//最忙组中每个进程的平均负载
if (group_imb)//如果组内cpu上的最大负载与最小负载之差大于组内平均负载的2倍,则进行一下修正
busiest_load_per_task = min(busiest_load_per_task, avg_load);
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load, as either of these
* actions would just result in more rebalancing later, and ping-pong
* tasks around. Thus we look for the minimum possible imbalance.
* Negative imbalances (*we* are more loaded than anyone else) will
* be counted as no imbalance for these purposes -- we can't fix that
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
if (max_load <= busiest_load_per_task)
goto out_balanced;
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
if (max_load __cpu_power,
(avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;//计算一下需要移动的负载量,下面就是一些太细节的东西了,从逻辑上也讲不好是什么道理,所以不进行分析
/*
* if *imbalance is less than the average load per runnable task
* there is no gaurantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
if (*imbalance this_load_per_task)
imbn = 1;
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
}
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU power used by
* moving them.
*/
pwr_now += busiest->__cpu_power *
min(busiest_load_per_task, max_load);
pwr_now += this->__cpu_power *
min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
tmp = sg_div_cpu_power(busiest,
busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
if (max_load * busiest->__cpu_power __cpu_power);
else
tmp = sg_div_cpu_power(this,
busiest_load_per_task * SCHED_LOAD_SCALE);
pwr_move += this->__cpu_power *
min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
*imbalance = busiest_load_per_task;
}
return busiest;
out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto ret;
if (this == group_leader && group_leader != group_min) {
*imbalance = min_load_per_task;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
cpumask_first(sched_group_cpus(group_leader));
}
return group_min;
}
#endif
ret:
*imbalance = 0;
return NULL;
}
顺着load_balance的调用路线,接下来就要执行find_busiest_queue了,这个函数比较好理解
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 *
 * Only cpus that are in both @group and @cpus are considered. A runqueue
 * holding exactly one task whose weighted load exceeds @imbalance is
 * skipped. Returns the runqueue with the highest weighted_cpuload(), or
 * NULL when no cpu qualifies (a load of 0 never qualifies). @idle is not
 * used here.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
		   unsigned long imbalance, const struct cpumask *cpus)
{
	struct rq *heaviest = NULL;
	unsigned long heaviest_load = 0;
	int cpu;

	for_each_cpu(cpu, sched_group_cpus(group)) {
		struct rq *candidate;
		unsigned long load;

		/* not among the cpus the caller allows */
		if (!cpumask_test_cpu(cpu, cpus))
			continue;

		candidate = cpu_rq(cpu);
		load = weighted_cpuload(cpu);

		/* not heavier than the best seen so far */
		if (load <= heaviest_load)
			continue;

		/* a lone task heavier than the requested imbalance */
		if (candidate->nr_running == 1 && load > imbalance)
			continue;

		heaviest_load = load;
		heaviest = candidate;
	}

	return heaviest;
}
再次回到load_balance的调用路线中,这次终于可以实施最终的移动了
busiest = find_busiest_queue(group, idle, imbalance, cpus);// find the busiest runqueue (i.e. cpu) in that group
if (!busiest) {// no cpu qualifies, so nothing to balance
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], imbalance);// statistics
ld_moved = 0;// records whether any task was moved
if (busiest->nr_running > 1) {// only try to move tasks when the busiest queue has more than one runnable task; see the comment below for the nr_running <= 1 case
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
local_irq_save(flags);
double_rq_lock(this_rq, busiest);// lock both runqueues (per the article's author, in address order to avoid deadlock; double_rq_lock itself is not shown here)
ld_moved = move_tasks(this_rq, this_cpu, busiest,// pick tasks from busiest, guided by imbalance, and move them to this_rq
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
/*
* some other cpu did the load balance for us.
*/
if (ld_moved && this_cpu != smp_processor_id())
resched_cpu(this_cpu);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
if (!cpumask_empty(cpus))
goto redo;
goto out_balanced;
}
}
加锁之后,便进入到了move_tasks中,
/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 *
 * Each scheduling class, starting at sched_class_highest, is asked in turn
 * to move whatever part of the budget is still outstanding.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	const struct sched_class *cls = sched_class_highest;
	unsigned long moved = 0;
	int best_prio = this_rq->curr->prio;

	for (;;) {
		moved += cls->load_balance(this_rq, this_cpu, busiest,
					   max_load_move - moved,
					   sd, idle, all_pinned, &best_prio);
		cls = cls->next;

		/* a newly idle cpu stops as soon as it has something to run */
		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
			break;

		/* out of classes, or the budget has been met */
		if (!cls || max_load_move <= moved)
			break;
	}

	return moved > 0;
}
函数主要由一个do-while循环完成,开始时class指向sched_class_highest,而在sched.c里面有:
/* Starting point for scheduling-class walks such as move_tasks(): the address of rt_sched_class. */
#define sched_class_highest (&rt_sched_class)
也就是说,在循环第一次执行时,会调用rt_sched_class调度类里对应的load_balance函数,去sched_rt.c里面寻找,发现如下:
.load_balance = load_balance_rt,
/*
* load_balance hook of the RT scheduling class.
*
* Moves nothing and always returns 0; all parameters are ignored.
*/
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
/* don't touch RT tasks */
return 0;
}
可见,这是一个空函数,也就是对于负载平衡,是不会将rt类进程迁移走的,循环只好进入下一个调度类,也就是cfs调度类。去执行它所对应的load_balance函数
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* load_balance hook of the fair class when group scheduling is enabled.
*
* Walks every task group and asks __load_balance_fair() to pull tasks from
* that group's cfs_rq on the busiest cpu, scaling the remaining budget by
* the group's weight and h_load on the way in and back again on the way out.
* Returns the amount of load moved (max_load_move minus what is left).
*/
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
long rem_load_move = max_load_move;// rem_load_move = remaining load to move
int busiest_cpu = cpu_of(busiest);// cpu that owns the busiest runqueue
struct task_group *tg;
rcu_read_lock();
update_h_load(busiest_cpu);// recompute h_load of every group's cfs_rq on that cpu
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
u64 rem_load, moved_load;
/*
* empty group
*/
if (!busiest_cfs_rq->task_weight)
continue;
/* scale the remaining budget: rem_load_move * weight / (h_load + 1) */
rem_load = (u64)rem_load_move * busiest_weight;
rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
rem_load, sd, idle, all_pinned, this_best_prio,
tg->cfs_rq[busiest_cpu]);
if (!moved_load)
continue;
/* scale what was moved back: moved * h_load / (weight + 1) */
moved_load *= busiest_h_load;
moved_load = div_u64(moved_load, busiest_weight + 1);
rem_load_move -= moved_load;
/* stop once the budget has been overshot */
if (rem_load_move < 0)
break;
}
rcu_read_unlock();
return max_load_move - rem_load_move;
}
update_h_load与之前看到过的更新shares值的函数比较像
/*
* Recompute h_load for every task group on @cpu by walking the group tree
* with tg_load_down on the way down and tg_nop on the way up.
*/
static void update_h_load(long cpu)
{
/* the cpu number is passed through the visitor's void * argument */
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
前面已经说过tg_nop函数是一个空函数,来看一下tg_load_down函数
/*
* Compute the cpu's hierarchical load factor for each task group.
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*
* @tg:   task group being visited
* @data: the cpu number, cast to a pointer
*
* Always returns 0, so the tree walk is never aborted.
*/
static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
long cpu = (long)data;
if (!tg->parent) {
/* root group: start from the runqueue's total load weight */
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;// start from the parent's h_load on this cpu
load *= tg->cfs_rq[cpu]->shares;// scale by this group's shares on this cpu (the article's author equates this with the group's load weight at this level)
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;// +1 keeps the divisor non-zero
}
tg->cfs_rq[cpu]->h_load = load;
return 0;
}
上面的代码算下来,就是本层调度组的h_load = 父调度组的h_load * 本调度组的shares值 / (父调度组的load_weight + 1);对于根组,h_load直接取该cpu运行队列的load.weight。
其实说白了就是按负载比例进行分配。
将本队列中各组需要移动的负载量计算出来以后,就可以去各组中去挑选实际的进程了。回到load_balance_fair函数中
update_h_load(busiest_cpu);// recompute h_load of every group's cfs_rq on that cpu
list_for_each_entry_rcu(tg, &task_groups, list) {// for each task group's cfs_rq on the busiest cpu
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
unsigned long busiest_h_load = busiest_cfs_rq->h_load;// h_load of this group's cfs_rq, just filled in by update_h_load
unsigned long busiest_weight = busiest_cfs_rq->load.weight;// load weight of this group's cfs_rq
u64 rem_load, moved_load;
/*
* empty group
*/
if (!busiest_cfs_rq->task_weight)
continue;
rem_load = (u64)rem_load_move * busiest_weight;
rem_load = div_u64(rem_load, busiest_h_load + 1);// rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1)
moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
rem_load, sd, idle, all_pinned, this_best_prio,
tg->cfs_rq[busiest_cpu]);// this is where tasks actually get moved
if (!moved_load)
continue;
moved_load *= busiest_h_load;
moved_load = div_u64(moved_load, busiest_weight + 1);
rem_load_move -= moved_load;// account for what this group moved; decides whether later groups still need to be visited
if (rem_load_move < 0)
break;
}
rcu_read_unlock();
return max_load_move - rem_load_move;
对于__load_balance_fair,如下:
static unsigned long
__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
struct cfs_rq *cfs_rq)
{
struct rq_iterator cfs_rq_iterator;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
cfs_rq_iterator.arg = cfs_rq;
return balance_tasks(this_rq, this_cpu, busiest,
max_load_move, sd, idle, all_pinned,
this_best_prio, &cfs_rq_iterator);
}
还需要进入到balance_tasks中去
/*
 * Pull tasks supplied by @iterator from @busiest to @this_rq until roughly
 * @max_load_move of weighted load has been moved.
 *
 * @all_pinned:     if non-NULL, receives 1 when the iteration started and no
 *                  candidate cleared the flag through can_migrate_task(),
 *                  0 otherwise
 * @this_best_prio: in/out: lowered to the smallest prio value among the
 *                  pulled tasks that were followed by another iteration
 *
 * Returns the amount of weighted load moved.
 *
 * NOTE(review): the published listing lost the priority update inside the
 * final "rem_load_move > 0" block (everything between '<' and '>'); it is
 * restored here from the upstream 2.6.29 kernel/sched.c.
 */
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      unsigned long max_load_move, struct sched_domain *sd,
	      enum cpu_idle_type idle, int *all_pinned,
	      int *this_best_prio, struct rq_iterator *iterator)
{
	int loops = 0, pulled = 0, pinned = 0;
	struct task_struct *p;
	long rem_load_move = max_load_move;

	if (max_load_move == 0)
		goto out;

	/* assume pinned until can_migrate_task() says otherwise */
	pinned = 1;

	/*
	 * Start the load-balancing iterator:
	 */
	p = iterator->start(iterator->arg);
next:
	/* stop when the iterator is exhausted or the migrate limit is hit */
	if (!p || loops++ > sysctl_sched_nr_migrate)
		goto out;

	/*
	 * Skip a task whose weight is more than about twice the remaining
	 * budget, or which can_migrate_task() refuses.
	 */
	if ((p->se.load.weight >> 1) > rem_load_move ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
		p = iterator->next(iterator->arg);
		goto next;
	}

	/* movable: pull the task over to this_rq on this_cpu */
	pull_task(busiest, p, this_rq, this_cpu);
	pulled++;
	rem_load_move -= p->se.load.weight;

	/*
	 * We only want to steal up to the prescribed amount of weighted load.
	 */
	if (rem_load_move > 0) {
		if (p->prio < *this_best_prio)
			*this_best_prio = p->prio;
		p = iterator->next(iterator->arg);
		goto next;
	}
out:
	/*
	 * Right now, this is one of only two places pull_task() is called,
	 * so we can safely collect pull_task() stats here rather than
	 * inside pull_task().
	 */
	schedstat_add(sd, lb_gained[idle], pulled);

	if (all_pinned)
		*all_pinned = pinned;

	return max_load_move - rem_load_move;
}
can_migrate_task的代码如下:
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*
* Returns 1 if p may be migrated, 0 otherwise. *all_pinned is cleared as
* soon as p passes the cpus_allowed check, even if p is then rejected for
* another reason.
*/
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
/* Affinity is tested first so that *all_pinned reflects only this check. */
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
schedstat_inc(p, se.nr_failed_migrations_affine);
return 0;
}
/* At least one task is allowed on this_cpu. */
*all_pinned = 0;
if (task_running(rq, p)) {
schedstat_inc(p, se.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
if (!task_hot(p, rq->clock, sd) ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
/* Statistics only: a hot task is being migrated anyway. */
if (task_hot(p, rq->clock, sd)) {
schedstat_inc(sd, lb_hot_gained[idle]);
schedstat_inc(p, se.nr_forced_migrations);
}
#endif
return 1;
}
/*
 * Reaching this point means the first test saw the task as hot and the
 * failure count was not above cache_nice_tries.
 * NOTE(review): assuming task_hot() gives the same answer again, the
 * final "return 1" below is never reached - confirm.
 */
if (task_hot(p, rq->clock, sd)) {
schedstat_inc(p, se.nr_failed_migrations_hot);
return 0;
}
return 1;
}
注释中写的极为详细,这里不作过多解释。
那么往下,就来看一看pull_task吧
/*
* pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
static void pull_task(struct rq *src_rq, struct task_struct *p,
struct rq *this_rq, int this_cpu)
{
/* Take p off the source runqueue. */
deactivate_task(src_rq, p, 0);
/* Retarget p at this_cpu; p is not yet on the new runqueue. */
set_task_cpu(p, this_cpu);
/* Final step: put p on this_rq. */
activate_task(this_rq, p, 0);
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
check_preempt_curr(this_rq, p, 0);
}
set_task_cpu函数:
/*
 * set_task_cpu - rebase p's per-runqueue timestamps and vruntime from its
 * current CPU to new_cpu, then record new_cpu via __set_task_cpu().
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
int old_cpu = task_cpu(p);
struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
struct cfs_rq *old_cfsrq = task_cfs_rq(p),
*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
u64 clock_offset;
/* Difference between the two runqueue clocks (unsigned arithmetic). */
clock_offset = old_rq->clock - new_rq->clock;
trace_sched_migrate_task(p, task_cpu(p), new_cpu);
#ifdef CONFIG_SCHEDSTATS
/* Shift any non-zero schedstat timestamps by the clock difference. */
if (p->se.wait_start)
p->se.wait_start -= clock_offset;
if (p->se.sleep_start)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
/* Count real migrations, and separately those of still-hot tasks. */
if (old_cpu != new_cpu) {
schedstat_inc(p, se.nr_migrations);
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
}
#endif
/* Re-express vruntime relative to the destination queue's min_vruntime. */
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
__set_task_cpu(p, new_cpu);
}
__set_task_cpu:
/*
 * __set_task_cpu - point p's scheduling entities at the queues for @cpu and,
 * on SMP, publish @cpu in the task's thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
set_task_rq(p, cpu);
#ifdef CONFIG_SMP
/*
* After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
* successfully executed on another CPU. We must ensure that updates of
* per-task data have been completed by this moment.
*/
smp_wmb();
task_thread_info(p)->cpu = cpu;
#endif
}
set_task_rq:
/*
 * Change a task's cfs_rq and parent entity if it moves across CPUs/groups.
 *
 * The task group is left unchanged: only the per-CPU queue and parent
 * entity of that same group are switched to the ones for @cpu. Each half
 * is compiled in only with the matching group-scheduling config option.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
#endif
#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}
可见,p进程原来属于哪个组,移动后还是属于哪个组,只不过它被移动到了该组在其它cpu上的运行队列中
由move_tasks产生的动作到这里就完了,其实就是按照先算出来的每个组需要移动的负载量,依次从每个组中挑选进程移走。
再次回到load_balance函数中。现在的情况是:先找到了该调度域中最忙的调度组,以及最忙调度组中最忙的cpu,又通过move_tasks将各进程组在此队列上的进程适当地迁移到了this_cpu上。那么,最后可以检查一下工作,看看上述这些工作完成得怎么样:
/*
 * Excerpt from the tail of load_balance(): act on the result of the
 * move_tasks() pass. On repeated failure, ask the busiest CPU's migration
 * thread to push a task to this_cpu; then adapt the balancing interval.
 */
if (!ld_moved) {
	/* No task was moved in this pass: record the failure. */
	schedstat_inc(sd, lb_failed[idle]);
	sd->nr_balance_failed++;

	/* The failure count has exceeded cache_nice_tries + 2. */
	if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

		spin_lock_irqsave(&busiest->lock, flags);

		/* don't kick the migration_thread, if the curr
		 * task on busiest cpu can't be moved to this_cpu
		 */
		if (!cpumask_test_cpu(this_cpu,
				      &busiest->curr->cpus_allowed)) {
			spin_unlock_irqrestore(&busiest->lock, flags);
			all_pinned = 1;
			goto out_one_pinned;
		}

		/* Request an active balance unless one is already pending. */
		if (!busiest->active_balance) {
			busiest->active_balance = 1;
			busiest->push_cpu = this_cpu;
			active_balance = 1;
		}
		spin_unlock_irqrestore(&busiest->lock, flags);

		/* Last resort: wake the migration thread of the busiest CPU. */
		if (active_balance)
			wake_up_process(busiest->migration_thread);

		/*
		 * We've kicked active balancing, reset the failure
		 * counter.
		 */
		sd->nr_balance_failed = sd->cache_nice_tries+1;
	}
} else
	sd->nr_balance_failed = 0;

if (likely(!active_balance)) {
	/* We were unbalanced, so reset the balancing interval */
	sd->balance_interval = sd->min_interval;
} else {
	/*
	 * If we've begun active balancing, start to back off. This
	 * case may not be covered by the all_pinned logic if there
	 * is only 1 task on the busy runqueue (because we don't call
	 * move_tasks).
	 */
	if (sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;
}
migration_thread是干什么的呢?原来,每个cpu都会绑定一个migration_thread内核线程,专门应对这种情况。至于绑定的方法,就是将这个线程的task_struct结构体中的cpu掩码设置好就可以了,这也说明了为什么前面代码中会有“不允许移动到this_cpu”的情况。
那么migration_thread都干些什么?在sched.c中有如下函数,在fork migration_thread时,该线程将会执行它:
/*
* migration_thread - this is a highprio system thread that performs
* thread migration by bumping thread off CPU then 'pushing' onto
* another runqueue.
*
* @data carries the CPU number this thread serves. Returns 0 once
* kthread_should_stop() becomes true.
*/
static int migration_thread(void *data)
{
int cpu = (long)data;
struct rq *rq;
rq = cpu_rq(cpu);
BUG_ON(rq->migration_thread != current);
/* Mark ourselves sleepy before checking for work, so schedule() below sleeps. */
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
struct migration_req *req;
struct list_head *head;
spin_lock_irq(&rq->lock);
/* Our CPU is offline: stop serving requests and wait for kthread_stop. */
if (cpu_is_offline(cpu)) {
spin_unlock_irq(&rq->lock);
goto wait_to_die;
}
/* Serve a pending active-balance request (runs with rq->lock held). */
if (rq->active_balance) {
active_load_balance(rq, cpu);
rq->active_balance = 0;
}
head = &rq->migration_queue;
/* No queued migration request: drop the lock and sleep. */
if (list_empty(head)) {
spin_unlock_irq(&rq->lock);
schedule();
set_current_state(TASK_INTERRUPTIBLE);
continue;
}
/* Dequeue the first request and process it. */
req = list_entry(head->next, struct migration_req, list);
list_del_init(head->next);
/* Release the lock only; interrupts are re-enabled after __migrate_task(). */
spin_unlock(&rq->lock);
__migrate_task(req->task, cpu, req->dest_cpu);
local_irq_enable();
/* Signal the requester through the request's completion. */
complete(&req->done);
}
__set_current_state(TASK_RUNNING);
return 0;
wait_to_die:
/* Wait for kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
schedule();
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
}
按刚才的情景,会执行到active_load_balance函数
/*
* active_load_balance is run by migration threads. It pushes running tasks
* off the busiest CPU onto idle CPUs. It requires at least 1 task to be
* running on each physical CPU where possible, and avoids physical /
* logical imbalances.
*
* Called with busiest_rq locked.
*/
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
int target_cpu = busiest_rq->push_cpu;
struct sched_domain *sd;
struct rq *target_rq;
/* Is there any task to move? */
if (busiest_rq->nr_running flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
if (likely(sd)) {
schedstat_inc(sd, alb_count);
if (move_one_task(target_rq, target_cpu, busiest_rq,
sd, CPU_IDLE))//这里是move_one_task,也就是说只移动一个进程,减小了力度,毕竟是受阻才会执行到这里的
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
double_unlock_balance(busiest_rq, target_rq);
}
在该线程被唤醒之前,push_cpu就已经被设置为load_balance里的this_cpu,也就是说,当时移动不了,那就过后再移动,但是目标cpu还是不变的
此外,migration_thread线程还会检查rq中是否有提交上来的需要转移的进程,如果有,一并将其转移,那么进程究竟是怎么跑到这个队列中来的呢?用cscope一路查下去,发现是在exec中,也就是sys_execve系统调用的执行过程中。