zoukankan      html  css  js  c++  java
  • Linux中的进程调度(五)

      前面主要分析的是新建一个进程时,内核在调度方面进行的处理。 现在来看当时钟中断时会发生的动作。 当时钟中断发生时,顺着中断处理程序调用路线,会调用到scheduler_tick()
    /*
     * This function gets called by the timer code, with HZ frequency.
     * We call it with interrupts disabled.
     *
     * It also gets called by the fork code, when changing the parent's
     * timeslices.
     */
    void scheduler_tick(void)
    {
    	int cpu = smp_processor_id();/* id of the CPU this tick runs on */
    	struct rq *rq = cpu_rq(cpu);/* that CPU's runqueue; both the id and the rq are per-cpu data */
    	struct task_struct *curr = rq->curr;/* task currently running on this CPU */
    
    	sched_clock_tick();/* per-cpu scheduler-clock bookkeeping; whether it does real work depends on CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
    
    	spin_lock(&rq->lock);/* other CPUs may concurrently "pull" tasks from this runqueue for load balancing, so take the rq lock */
    	update_rq_clock(rq);/* refresh rq->clock from the underlying clock source */
    	update_cpu_load(rq);/* update this CPU's load statistics (heuristic decaying averages) */
    	curr->sched_class->task_tick(rq, curr, 0);/* delegate the periodic tick to curr's scheduling class */
    	spin_unlock(&rq->lock);
    
    #ifdef CONFIG_SMP/* SMP load-balancing hook */
    	rq->idle_at_tick = idle_cpu(cpu);/* remember whether this CPU was idle at the tick */
    	trigger_load_balance(rq, cpu);/* kick periodic load balancing if due */
    #endif
    }
    对于属于cfs调度类的进程,其task_tick函数为task_tick_fair()
    /*
     * scheduler tick hitting a task of our scheduling class: run the
     * per-entity tick for every entity on curr's hierarchy path (a
     * single pass when group scheduling is disabled).
     */
    static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
    {
    	struct sched_entity *entity = &curr->se;
    	struct cfs_rq *group_rq;
    
    	for_each_sched_entity(entity) {
    		group_rq = cfs_rq_of(entity);
    		entity_tick(group_rq, entity, queued);
    	}
    }
     
    static void
    entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
    {
    	/*
    	 * Update run-time statistics of the 'current'.
    	 */
    	update_curr(cfs_rq);/* bring runtime/vruntime accounting of the running entity up to date */
    
    #ifdef CONFIG_SCHED_HRTICK/* high-resolution tick handling */
    	/*
    	 * queued ticks are scheduled to match the slice, so don't bother
    	 * validating it and just reschedule.
    	 */
    	if (queued) {
    		resched_task(rq_of(cfs_rq)->curr);
    		return;
    	}
    	/*
    	 * don't let the period tick interfere with the hrtick preemption
    	 */
    	if (!sched_feat(DOUBLE_TICK) &&
    			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
    		return;
    #endif
    
    	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))/* with more than one runnable entity, check whether curr should be preempted */
    		check_preempt_tick(cfs_rq, curr);
    }
    接下来到重点了
    /*
     * Decide whether the running entity has exhausted its fair share of
     * wall-clock time; if so, mark the CPU's current task for rescheduling
     * (the actual switch happens on return to user space).
     */
    static void
    check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
    {
    	unsigned long slice = sched_slice(cfs_rq, curr);
    	unsigned long ran = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    
    	if (ran > slice)
    		resched_task(rq_of(cfs_rq)->curr);
    }
    时钟中断经常发生,将每个进程所应得的运行时间直接存入进程结构体不行么?不是会更省时间么?是的,以前的内核中确实是这样做的,可是这样做有一个缺点,就是一个进程的可运行时间在创建之初就已被确定(O(1)调度算法中虽然可以根据进程行为启发的去调整时间片,但本质上还没有解决问题),每次都去计算的方法,则会根据队列中的进程数量,动态的去分配时间,更为合理(因为如果进程数目一直在上升,显然调度周期需要适当的缩短)。至于时间的计算,比较简单,只给出代码,不作注释,但是,这里为什么要用实际时间呢?用虚拟时间不行么?
    /*
     * We calculate the wall-time slice from the period by taking a part
     * proportional to the weight.  @se itself is counted even when it is
     * not enqueued yet.
     *
     * s = p*P[w/rw]
     */
    static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	unsigned long queued = cfs_rq->nr_running + (unlikely(!se->on_rq) ? 1 : 0);
    
    	return calc_delta_weight(__sched_period(queued), se);
    }
    /*
     * delta *= P[w / rw]
     */
    static inline unsigned long
    calc_delta_weight(unsigned long delta, struct sched_entity *se)
    {
    	for_each_sched_entity(se) {
    		delta = calc_delta_mine(delta,
    				se->load.weight, &cfs_rq_of(se)->load);
    	}
    
    	return delta;
    }
    可见中断处理的过程其实很简单,主要就是更新当前进程的运行时间信息,然后与理想值进行比较,如果发现已经超时就置调度位,待从内核态回到用户态时执行调度。 再来看若进程主动调用schedule放弃cpu,也就是显式调用时的动作。
    /*
     * schedule() is the main scheduler function.
     */
    asmlinkage void __sched schedule(void)
    {
    	struct task_struct *prev, *next;
    	unsigned long *switch_count;
    	struct rq *rq;
    	int cpu;
    
    need_resched:
    	preempt_disable();
    	cpu = smp_processor_id();
    	rq = cpu_rq(cpu);
    	rcu_qsctr_inc(cpu);/* per-cpu RCU counter; NOTE(review): presumably records an RCU quiescent state on this CPU — confirm */
    	prev = rq->curr;
    	switch_count = &prev->nivcsw;/* default: account an involuntary context switch */
    
    	release_kernel_lock(prev);
    need_resched_nonpreemptible:
    
    	schedule_debug(prev);
    
    	if (sched_feat(HRTICK))
    		hrtick_clear(rq);
    
    	spin_lock_irq(&rq->lock);
    	update_rq_clock(rq);
    	clear_tsk_need_resched(prev);
    
    	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
    		if (unlikely(signal_pending_state(prev->state, prev)))/* a signal is pending: give prev another chance instead of taking it off the runqueue */
    			prev->state = TASK_RUNNING;
    		else
    			deactivate_task(rq, prev, 1);/* remove prev from the runqueue */
    		switch_count = &prev->nvcsw;/* prev blocked voluntarily: account a voluntary switch instead */
    	}
    
    #ifdef CONFIG_SMP
    	if (prev->sched_class->pre_schedule)/* the cfs class does not implement this hook */
    		prev->sched_class->pre_schedule(rq, prev);
    #endif
    
    	if (unlikely(!rq->nr_running))/* runqueue empty: try to pull work over from other CPUs */
    		idle_balance(cpu, rq);
    
    	prev->sched_class->put_prev_task(rq, prev);/* hand prev back to its class (re-queued if still runnable) */
    	next = pick_next_task(rq, prev);
    
    	if (likely(prev != next)) {
    		sched_info_switch(prev, next);
    
    		rq->nr_switches++;
    		rq->curr = next;
    		++*switch_count;
    
    		context_switch(rq, prev, next); /* unlocks the rq */
    		/*
    		 * the context switch might have flipped the stack from under
    		 * us, hence refresh the local variables.
    		 */
    		cpu = smp_processor_id();
    		rq = cpu_rq(cpu);
    	} else
    		spin_unlock_irq(&rq->lock);
    
    	if (unlikely(reacquire_kernel_lock(current) < 0))
    		goto need_resched_nonpreemptible;
    
    	preempt_enable_no_resched();
    	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
    		goto need_resched;
    }
    EXPORT_SYMBOL(schedule);
    deactivate_task的代码如下:
     /*
      * deactivate_task - remove a task from the runqueue, keeping the
      * rq's uninterruptible-sleeper and runnable counters in sync.
      */
     static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
     {
         int uninterruptible = (p->state == TASK_UNINTERRUPTIBLE);
 
         if (uninterruptible)
             rq->nr_uninterruptible++;
 
         dequeue_task(rq, p, sleep);   /* class-specific removal from the queue */
         dec_nr_running(p, rq);        /* one fewer runnable task on this rq */
     }
    dequeue_task的代码如下:
     static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
     {
         if (sleep && p->se.last_wakeup) {//等分析被唤醒时的情景时再来看
             update_avg(&p->se.avg_overlap,
                    p->se.sum_exec_runtime - p->se.last_wakeup);
             p->se.last_wakeup = 0;
         }
    
         sched_info_dequeued(p);//条件编译,对调度特性无影响
         p->sched_class->dequeue_task(rq, p, sleep);//真正的要删除了
         p->se.on_rq = 0;//标记为该进程已不在可运行队列中
     }
    对于cfs调度类,dequeue_task所对应的函数为dequeue_task_fair(从命名规则也可以猜出应该是这个函数:)
    /*
     * The dequeue_task method is called before nr_running is
     * decreased. We remove the task from the rbtree and
     * update the fair scheduling stats:
     */
    static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
    {
    	struct cfs_rq *cfs_rq;
    	struct sched_entity *se = &p->se;
    
    	for_each_sched_entity(se) {	/* single iteration unless group scheduling is enabled */
    		cfs_rq = cfs_rq_of(se);
    		dequeue_entity(cfs_rq, se, sleep);	/* the actual removal from the rbtree */
    		/* Don't dequeue parent if it has other entities besides us */
    		if (cfs_rq->load.weight)
    			break;
    		sleep = 1;
    	}
    
    	hrtick_update(rq);
    }
    继续往里走,看dequeue_entity的代码
    static void
    dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
    {
    	/*
    	 * Update run-time statistics of the 'current'.
    	 */
    	update_curr(cfs_rq);/* bring runtime/vruntime accounting up to date first */
    
    	update_stats_dequeue(cfs_rq, se);/* schedstats bookkeeping for the dequeue */
    	if (sleep) {
    #ifdef CONFIG_SCHEDSTATS
    		if (entity_is_task(se)) {
    			struct task_struct *tsk = task_of(se);
    
    			/* record when the sleep/block period began, for statistics */
    			if (tsk->state & TASK_INTERRUPTIBLE)
    				se->sleep_start = rq_of(cfs_rq)->clock;
    			if (tsk->state & TASK_UNINTERRUPTIBLE)
    				se->block_start = rq_of(cfs_rq)->clock;
    		}
    #endif
    	}
    
    	clear_buddies(cfs_rq, se);
    
    	/* the running entity is not kept in the rbtree, so only remove se
    	 * from the tree when it is not this cfs_rq's current */
    	if (se != cfs_rq->curr)
    		__dequeue_entity(cfs_rq, se);
    	account_entity_dequeue(cfs_rq, se);/* undo load/weight accounting for se */
    	update_min_vruntime(cfs_rq);
    }
    clear_buddies这个函数会和别的概念有联系,等下单独分析。 重点看这句
    if (se != cfs_rq->curr)
            __dequeue_entity(cfs_rq, se);
    嗯?不对吧?se不就是前面传过来的prev么?也就是当前cpu上正在运行的进程(不一定就是cfs里的current进程),如果不是,才将其从红黑树里移除? 删除之后,更新cpu的负载信息,更新cfs队列的负载信息,更新队列的可运行进程数目,将被删除进程标记为已不在可运行队列中
      /*
       * Undo the load/weight accounting for @se as it leaves @cfs_rq.
       */
      static void
      account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
      {
          unsigned long weight = se->load.weight;
 
          update_load_sub(&cfs_rq->load, weight);   /* drop se's weight from the queue load */
          if (!parent_entity(se))
              dec_cpu_load(rq_of(cfs_rq), weight);  /* top-level entity: also drop it from the CPU load */
          if (entity_is_task(se)) {
              add_cfs_task_weight(cfs_rq, -weight);
              list_del_init(&se->group_node);
          }
          cfs_rq->nr_running--;                     /* one fewer runnable entity */
          se->on_rq = 0;                            /* se is no longer on the runqueue */
      }
    转了这么远后,回来继续schedule()函数的执行,也就是说,将当前进程移出可运行队列后,如果队列为空将执行一次cpu之间的负载平衡操作,然后执行以下语句
    prev->sched_class->put_prev_task(rq, prev);
    同样,对于cfs调度类,put_prev_task对应put_prev_task_fair,  
    /*
     * Account for a descheduled task:
     */
    static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
    {
    	struct sched_entity *se = &prev->se;
    	struct cfs_rq *cfs_rq;
    
    	for_each_sched_entity(se) {
    		cfs_rq = cfs_rq_of(se);
    		put_prev_entity(cfs_rq, se);
    	}
    }
    同样忽略组调度,进入put_prev_entity
    static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
    {
    	/*
    	 * If still on the runqueue then deactivate_task()
    	 * was not called and update_curr() has to be done:
    	 */
    	if (prev->on_rq)
    		update_curr(cfs_rq);
    
    	check_spread(cfs_rq, prev);
    	if (prev->on_rq) {
    		update_stats_wait_start(cfs_rq, prev);
    		/* Put 'current' back into the tree. */
    		__enqueue_entity(cfs_rq, prev);
    	}
    	/* no entity is 'current' on this cfs_rq any more */
    	cfs_rq->curr = NULL;
    }
    按照前面的分析,在这个情景中,这时prev->on_rq已经为零了,所以这个函数实际上只是把cfs_rq->curr置空。 再次回到schedule的主路线中,当前进程已经被移走了,接下来就要选择一个新的进程来执行了
      prev->sched_class->put_prev_task(rq, prev);
      next = pick_next_task(rq, prev);
     
    /*
     * Pick up the highest-prio task:
     */
    static inline struct task_struct *
    pick_next_task(struct rq *rq, struct task_struct *prev)
    {
    	const struct sched_class *sc;
    	struct task_struct *task;
    
    	/*
    	 * Optimization: we know that if all tasks are in
    	 * the fair class we can call that function directly:
    	 */
    	if (likely(rq->nr_running == rq->cfs.nr_running)) {
    		task = fair_sched_class.pick_next_task(rq);
    		if (likely(task))
    			return task;
    	}
    
    	/*
    	 * Otherwise walk the classes from highest priority downwards.
    	 * The walk always terminates because the idle class never
    	 * returns NULL.
    	 */
    	for (sc = sched_class_highest; ; sc = sc->next) {
    		task = sc->pick_next_task(rq);
    		if (task)
    			return task;
    	}
    }
    代码注释写的非常明白,我们进入到cfs调度类的代码中去看一下是怎么pick up的(其函数名想必已经能猜出来了:)
    /*
     * Choose the next task for the fair class: descend the cfs hierarchy,
     * selecting and activating an entity at each level.
     */
    static struct task_struct *pick_next_task_fair(struct rq *rq)
    {
    	struct cfs_rq *queue = &rq->cfs;
    	struct sched_entity *entity;
    	struct task_struct *next;
    
    	if (unlikely(!queue->nr_running))
    		return NULL;	/* nothing runnable in the fair class */
    
    	for (;;) {
    		entity = pick_next_entity(queue);
    		set_next_entity(queue, entity);	/* remove it from the tree, make it 'current' */
    		queue = group_cfs_rq(entity);
    		if (!queue)
    			break;
    	}
    
    	next = task_of(entity);
    	hrtick_start_fair(rq, next);
    
    	return next;
    }
    再来看pick_next_entity
    /*
     * Pick the entity to run next: normally the leftmost (smallest
     * vruntime) node, but prefer the cfs_rq's 'next' or 'last' buddy
     * hint when choosing it would not be too unfair.
     */
    static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
    {
    	struct sched_entity *se = __pick_next_entity(cfs_rq);
    
    	/* honour the 'next' buddy hint unless that would be too unfair */
    	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
    		return cfs_rq->next;
    
    	/* likewise for the 'last' buddy hint */
    	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
    		return cfs_rq->last;
    
    	return se;
    }
    居然还有更深一层~
    /*
     * Leftmost node of the rbtree, i.e. the entity with the smallest
     * vruntime, or NULL when the tree is empty.
     */
    static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
    {
    	struct rb_node *leftmost = cfs_rq->rb_leftmost;
    
    	return leftmost ? rb_entry(leftmost, struct sched_entity, run_node) : NULL;
    }
    唔,果然是返回红黑树的最左结点 选择到最左叶子结点后,执行set_next_entity
    static void
    set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	/* 'current' is not kept within the tree. */
    	if (se->on_rq) {
    		/*
    		 * Any task has to be enqueued before it get to execute on
    		 * a CPU. So account for the time it spent waiting on the
    		 * runqueue.
    		 */
    		update_stats_wait_end(cfs_rq, se);
    		__dequeue_entity(cfs_rq, se);/* remove the chosen entity from the rbtree */
    	}
    
    	update_stats_curr_start(cfs_rq, se);
    	cfs_rq->curr = se;/* se is now this cfs_rq's running entity */
    #ifdef CONFIG_SCHEDSTATS
    	/*
    	 * Track our maximum slice length, if the CPU's load is at
    	 * least twice that of our own weight (i.e. dont track it
    	 * when there are only lesser-weight tasks around):
    	 */
    	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
    		se->slice_max = max(se->slice_max,
    			se->sum_exec_runtime - se->prev_sum_exec_runtime);
    	}
    #endif
    	/* baseline for the next slice-overrun check in check_preempt_tick() */
    	se->prev_sum_exec_runtime = se->sum_exec_runtime;
    }
    哦,将他从树中拿出来了,并更新它的prev_sum_exec_runtime信息,(还记得吧,在时钟中断的处理中,被调入后,当前进程的运行时间就是用now值减去这个prev_sum_exec_runtime的,然后判断是否需要被换出) 终于,pick up的工作完成了,旧的进程也已经从队列中移除了,下面可以完成交接工作,来让新的进程运行了,由于其与调度器特性无关,我们暂不分析。   分析了新建进程,进程主动放弃cpu,现在只剩下一种情景没分析了,那就是被唤醒,其实在那种情景下,如果不考虑SMP平衡负载的话,其实只做了一件事,就是将所对应进程加入可运行队列,更新可运行进程数,更新负载,task state置TASK_RUNNING,然后检查是否可以抢占当前进程,如是则进行置位。就是这样而已,相当简单,只给出片断:
    	update_rq_clock(rq);
    	activate_task(rq, p, 1);
    	success = 1;
    
    out_running:
    	trace_sched_wakeup(rq, p);
    	check_preempt_curr(rq, p, sync);
    
    	p->state = TASK_RUNNING;
           
  • 相关阅读:
    IOS开发银行系统的四舍五入的算法
    线程通信之初认识
    多线程同步机制练习之银行存钱
    解决线程安全问题方式三:loke锁
    使用同步机制解决单例模式中的懒汉式的线程安全问题
    (对于继承Thread类)线程安全问题解决方式二:同步静态方法
    (对于实现Runnable接口)线程安全问题解决方式二:同步方法
    (对于继承Thread类)线程安全问题解决方式一:同步代码块
    (对于实现Runnable接口)线程安全问题解决方式一:同步代码块
    三个窗口卖票(实现Runnable方式创建线程)之线程安全问题
  • 原文地址:https://www.cnblogs.com/yangce/p/2910094.html
Copyright © 2011-2022 走看看