zoukankan      html  css  js  c++  java
  • Linux soft lockup分析

    关键词:watchdog、soft lockup、percpu thread、lockdep等。

    近日遇到一个soft lockup问题,打印类似“[ 56.032356] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [cat:153]”。

    这是lockup检测机制在起作用,lockup检测机制包括soft lockup detector和hard lockup detector。

    借机分析下soft lockup机制以及什么情况下导致soft watchdog异常、对watchdog的配置、如何定位异常点。

    这里跳过hard lockup detector的分析。

    1. soft lockup机制分析

    lockup_detector_init()函数首先获取sample_period以及watchdog_cpumask,然后根据情况创建线程,启动喂狗程序;创建hrtimer启动看门狗。

    这里有两个重点:一个是创建内核线程的API,另一个是struct smp_hotplug_thread结构体。

    void __init lockup_detector_init(void)
    {
        set_sample_period();----------------------------------------获取变量sample_period,为watchdog_thresh*2/5,即4秒喂一次狗。
    ...
        cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
    
        if (watchdog_enabled)
            watchdog_enable_all_cpus();
    }
    
    static int watchdog_enable_all_cpus(void)
    {
        int err = 0;
    
        if (!watchdog_running) {----------------------------------如果当前watchdog没有在运行(watchdog_running为0),那么为每个CPU创建一个watchdog/x线程,这些线程每隔sample_period时间喂一次狗。watchdog_threads是watchdog/x线程的主要输入参数,watchdog_cpumask规定了为哪些CPU创建线程。
            err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
                                     &watchdog_cpumask);
            if (err)
                pr_err("Failed to create watchdog threads, disabled\n");
            else
                watchdog_running = 1;
        } else {
            err = update_watchdog_all_cpus();
    
            if (err) {
                watchdog_disable_all_cpus();
                pr_err("Failed to update lockup detectors, disabled\n");
            }
        }
    
        if (err)
            watchdog_enabled = 0;
    
        return err;
    }
    
    static void watchdog_disable_all_cpus(void)
    {
        if (watchdog_running) {
            watchdog_running = 0;
            smpboot_unregister_percpu_thread(&watchdog_threads);
        }
    }
    
    static int update_watchdog_all_cpus(void)
    {
        int ret;
    
        ret = watchdog_park_threads();
        if (ret)
            return ret;
    
        watchdog_unpark_threads();
    
        return 0;
    }
    
    static int watchdog_park_threads(void)
    {
        int cpu, ret = 0;
    
        atomic_set(&watchdog_park_in_progress, 1);
    
        for_each_watchdog_cpu(cpu) {
            ret = kthread_park(per_cpu(softlockup_watchdog, cpu));---------------------------设置struct kthread->flags的KTHREAD_SHOULD_PARK位,在watchdog/x线程中会调用park成员函数进行处理。
            if (ret)
                break;
        }
    
        atomic_set(&watchdog_park_in_progress, 0);
    
        return ret;
    }
    
    static void watchdog_unpark_threads(void)
    {
        int cpu;
    
        for_each_watchdog_cpu(cpu)
            kthread_unpark(per_cpu(softlockup_watchdog, cpu));-------------------------------清空struct kthread->flags的KTHREAD_SHOULD_PARK位,在watchdog/x线程中会调用unpark成员函数。
    }

    1.1 watchdog_threads结构体介绍

    在介绍如何创建watchdog/x线程之前,有必要先介绍一下struct smp_hotplug_thread结构体。

    struct smp_hotplug_thread {
        struct task_struct __percpu    **store;--------------------------存放percpu struct task_struct指针的指针。
        struct list_head        list;
        int                (*thread_should_run)(unsigned int cpu);-------检查是否应该运行watchdog/x线程。
        void                (*thread_fn)(unsigned int cpu);--------------watchdog/x线程的主函数。
        void                (*create)(unsigned int cpu);
        void                (*setup)(unsigned int cpu);------------------在运行watchdog/x线程之前的准备工作。
        void                (*cleanup)(unsigned int cpu, bool online);---在退出watchdog/x线程之后的清理工作。
        void                (*park)(unsigned int cpu);-------------------当CPU offline时,需要临时停止。
        void                (*unpark)(unsigned int cpu);-----------------当CPU变成online时,进行准备工作。
        cpumask_var_t            cpumask;--------------------------------允许哪些CPU online。
        bool                selfparking;
        const char            *thread_comm;------------------------------watchdog/x线程名称。
    };

     watchdog_threads是soft lockup监控线程的实体,基于此创建 watchdog/x线程。

    static struct smp_hotplug_thread watchdog_threads = {
        .store            = &softlockup_watchdog,
        .thread_should_run    = watchdog_should_run,
        .thread_fn        = watchdog,
        .thread_comm        = "watchdog/%u",
        .setup            = watchdog_enable,
        .cleanup        = watchdog_cleanup,
        .park            = watchdog_disable,
        .unpark            = watchdog_enable,
    };
    
    static void watchdog_enable(unsigned int cpu)
    {
        struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
    
        /* kick off the timer for the hardlockup detector */
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = watchdog_timer_fn;------------------------------------------创建一个hrtimer,超时函数为watchdog_timer_fn,这里面会检查watchdog_touch_ts变量是否超过20秒没有被更新。如果是,则有soft lockup。
    
        /* Enable the perf event */
        watchdog_nmi_enable(cpu);
    
        /* done here because hrtimer_start can only pin to smp_processor_id() */
        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
                  HRTIMER_MODE_REL_PINNED);---------------------------------------------启动一个超时为sample_period(4秒)的hrtimer,HRTIMER_MODE_REL_PINNED表示此hrtimer和当前CPU绑定。
    
        /* initialize timestamp */
        watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);---------------------------------设置当前线程为实时FIFO调度策略,sched_priority为MAX_RT_PRIO-1即99。对SCHED_FIFO而言sched_priority数值越大优先级越高,99是最高的实时优先级,因此watchdog/x线程优先于所有非实时线程以及优先级更低的实时线程得到调度。
        __touch_watchdog();-------------------------------------------------------------更新watchdog_touch_ts变量,相当于喂狗操作。
    }
    
    static void watchdog_set_prio(unsigned int policy, unsigned int prio)
    {
        struct sched_param param = { .sched_priority = prio };
    
        sched_setscheduler(current, policy, &param);
    }
    
    /* Commands for resetting the watchdog */
    static void __touch_watchdog(void)
    {
        __this_cpu_write(watchdog_touch_ts, get_timestamp());----------------------------喂狗的操作就是更新watchdog_touch_ts变量,也即当前时间戳。
    }
    
    
    static void watchdog_disable(unsigned int cpu)-------------------------------------相当于watchdog_enable()反操作,将线程恢复为普通线程;取消hrtimer。
    {
        struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
    
        watchdog_set_prio(SCHED_NORMAL, 0);
        hrtimer_cancel(hrtimer);
        /* disable the perf event */
        watchdog_nmi_disable(cpu);
    }
    
    static void watchdog_cleanup(unsigned int cpu, bool online)
    {
        watchdog_disable(cpu);
    }
    
    static int watchdog_should_run(unsigned int cpu)
    {
        return __this_cpu_read(hrtimer_interrupts) !=
            __this_cpu_read(soft_lockup_hrtimer_cnt);------------------------------------hrtimer_interrupts记录了产生hrtimer的次数;在watchdog()中,将hrtimer_interrupts赋给soft_lockup_hrtimer_cnt。两者相等表示没有hrtimer产生,不需要运行watchdog/x线程;相反不等,则需要watchdog/x线程运行。
    }
    static void watchdog(unsigned int cpu)
    {
        __this_cpu_write(soft_lockup_hrtimer_cnt,
                 __this_cpu_read(hrtimer_interrupts));-----------------------------------更新soft_lockup_hrtimer_cnt,在watchdog_should_run()中就返回false,表示线程不需要运行,即不需要喂狗。
        __touch_watchdog();--------------------------------------------------------------虽然就是一句话,但是却很重要的喂狗操作。
    
        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
            watchdog_nmi_disable(cpu);
    }

    1.2 创建喂狗线程watchdog/x

    在分析了watchdog_threads之后,再来看看如何创建watchdog/x线程。 

    int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
                           const struct cpumask *cpumask)
    {
        unsigned int cpu;
        int ret = 0;
    
        if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
            return -ENOMEM;
        cpumask_copy(plug_thread->cpumask, cpumask);
    
        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
        for_each_online_cpu(cpu) {------------------------------------------------遍历所有online CPU,为每个CPU创建一个percpu的watchdog/x线程。
            ret = __smpboot_create_thread(plug_thread, cpu);
            if (ret) {
                smpboot_destroy_threads(plug_thread);-----------------------------创建失败则释放相关资源。
                free_cpumask_var(plug_thread->cpumask);
                goto out;
            }
            if (cpumask_test_cpu(cpu, cpumask))
                smpboot_unpark_thread(plug_thread, cpu);--------------------------如果当前CPU在cpumask中,则清空KTHREAD_SHOULD_PARK让线程运行起来;线程首次运行时td->status为HP_THREAD_NONE,因此实际调用的是watchdog_threads的setup成员函数(setup和unpark都指向watchdog_enable())。
        }
        list_add(&plug_thread->list, &hotplug_threads);
    out:
        mutex_unlock(&smpboot_threads_lock);
        put_online_cpus();
        return ret;
    }
    
    static int
    __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
    {
        struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
        struct smpboot_thread_data *td;
    
        if (tsk)
            return 0;
    
        td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
        if (!td)
            return -ENOMEM;
        td->cpu = cpu;
        td->ht = ht;
    
        tsk =kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
                        ht->thread_comm);-----------------------------------------在指定CPU上创建watchdog/x线程,处理函数为smpboot_thread_fn()。
        if (IS_ERR(tsk)) {
            kfree(td);
            return PTR_ERR(tsk);
        }
        /*
         * Park the thread so that it could start right on the CPU
         * when it is available.
         */
        kthread_park(tsk);--------------------------------------------------------先将新创建的watchdog/x线程park住,等对应CPU可用并被unpark之后,线程才在该CPU上开始运行。
        get_task_struct(tsk);-----------------------------------------------------增加对线程的引用计数。
        *per_cpu_ptr(ht->store, cpu) = tsk;---------------------------------------store存放线程结构体指针的指针。
        if (ht->create) {
            if (!wait_task_inactive(tsk, TASK_PARKED))
                WARN_ON(1);
            else
                ht->create(cpu);
        }
        return 0;
    }
    
    static int smpboot_thread_fn(void *data)
    {
        struct smpboot_thread_data *td = data;
        struct smp_hotplug_thread *ht = td->ht;
    
        while (1) {
            set_current_state(TASK_INTERRUPTIBLE);
            preempt_disable();
            if (kthread_should_stop()) {----------------------------------------如果可以终止线程,调用cleanup,退出线程。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                /* cleanup must mirror setup */
                if (ht->cleanup && td->status != HP_THREAD_NONE)
                    ht->cleanup(td->cpu, cpu_online(td->cpu));
                kfree(td);
                return 0;
            }
    
            if (kthread_should_park()) {----------------------------------------如果KTHREAD_SHOULD_PARK置位,调用park()暂停进程执行。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                if (ht->park && td->status == HP_THREAD_ACTIVE) {
                    BUG_ON(td->cpu != smp_processor_id());
                    ht->park(td->cpu);
                    td->status = HP_THREAD_PARKED;
                }
                kthread_parkme();
                /* We might have been woken for stop */
                continue;
            }
    
            BUG_ON(td->cpu != smp_processor_id());
    
            /* Check for state change setup */
            switch (td->status) {
            case HP_THREAD_NONE:-----------------------------------------------相当于第一次运行,调用setup()进行初始化操作。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                if (ht->setup)
                    ht->setup(td->cpu);
                td->status = HP_THREAD_ACTIVE;
                continue;
    
            case HP_THREAD_PARKED:---------------------------------------------从parked状态恢复。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                if (ht->unpark)
                    ht->unpark(td->cpu);
                td->status = HP_THREAD_ACTIVE;
                continue;
            }
    
            if (!ht->thread_should_run(td->cpu)) {-----------------------------如果不需要进程运行,schedule()主动放弃CPU给其他线程使用。
                preempt_enable_no_resched();
                schedule();
            } else {
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                ht->thread_fn(td->cpu);----------------------------------------调用struct smp_hotplug_thread->thread_fn即watchdog(),进行喂狗操作。
            }
        }
    }
    
    void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)----将创建的内核线程移除操作。
    {
        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
        list_del(&plug_thread->list);
        smpboot_destroy_threads(plug_thread);
        mutex_unlock(&smpboot_threads_lock);
        put_online_cpus();
        free_cpumask_var(plug_thread->cpumask);
    }
    
    static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
    {
        unsigned int cpu;
    
        /* We need to destroy also the parked threads of offline cpus */
        for_each_possible_cpu(cpu) {
            struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
    
            if (tsk) {
                kthread_stop(tsk);
                put_task_struct(tsk);
                *per_cpu_ptr(ht->store, cpu) = NULL;
            }
        }
    }

    1.3 hrtimer看门狗

     在分析了喂狗线程watchdog/x之后,再来分析看门狗是如何实现的?

    看门狗是通过启动一个周期为4秒的hrtimer来实现的,这个hrtimer和CPU绑定,使用的变量都是percpu的。确保每个CPU之间不相互干扰。

    每次hrtimer超时,都会唤醒watchdog/x线程,并进行一次喂狗操作。

    因为hrtimer超时函数在硬中断上下文中调用,在中断产生后会比线程优先得到执行。

    所以在watchdog/x线程没有得到执行的情况下,通过is_softlockup()来判断看门狗是否超过20秒没有得到喂狗。

    static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
    {
        unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
        struct pt_regs *regs = get_irq_regs();
        int duration;
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
    
        if (atomic_read(&watchdog_park_in_progress) != 0)
            return HRTIMER_NORESTART;
    
        /* kick the hardlockup detector */
        watchdog_interrupt_count();------------------------------------------------------------------每产生一次中断,hrtimer_interrupts计数加1。hrtimer_interrupts记录了产生hrtimer的次数。
    
        /* kick the softlockup detector */
        wake_up_process(__this_cpu_read(softlockup_watchdog));---------------------------------------唤醒watchdog/x线程,进行喂狗操作。
    
        /* .. and repeat */
        hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));------------------------------------重新设置超时点,形成周期性时钟。
    ...
        duration = is_softlockup(touch_ts);----------------------------------------------------------返回非0表示,看门狗超时。
        if (unlikely(duration)) {--------------------------------------------------------------------看门狗超时情况的处理。
            if (kvm_check_and_clear_guest_paused())
                return HRTIMER_RESTART;
    
            /* only warn once */
            if (__this_cpu_read(soft_watchdog_warn) == true) {
                if (__this_cpu_read(softlockup_task_ptr_saved) !=
                    current) {
                    __this_cpu_write(soft_watchdog_warn, false);
                    __touch_watchdog();
                }
                return HRTIMER_RESTART;
            }
    
            if (softlockup_all_cpu_backtrace) {
                if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
                    /* Someone else will report us. Let's give up */
                    __this_cpu_write(soft_watchdog_warn, true);
                    return HRTIMER_RESTART;
                }
            }
    
            pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                smp_processor_id(), duration,
                current->comm, task_pid_nr(current));-------------------------------------------------打印哪个CPU被卡死duration秒,以及死在哪个进程。
            __this_cpu_write(softlockup_task_ptr_saved, current);
            print_modules();
            print_irqtrace_events(current);-----------------------------------------------------------显示开关中断、软中断信息,禁止中断和软中断也是造成soft lockup的一个原因。
            if (regs)---------------------------------------------------------------------------------有寄存器显示寄存器信息,同时显示栈信息。
                show_regs(regs);
            else
                dump_stack();
    
            if (softlockup_all_cpu_backtrace) {
                trigger_allbutself_cpu_backtrace();
    
                clear_bit(0, &soft_lockup_nmi_warn);
                /* Barrier to sync with other cpus */
                smp_mb__after_atomic();
            }
    
            add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
            if (softlockup_panic)---------------------------------------------------------------------如果定义softlockup_panic则进入panic()。
                panic("softlockup: hung tasks");
            __this_cpu_write(soft_watchdog_warn, true);
        } else
            __this_cpu_write(soft_watchdog_warn, false);
    
        return HRTIMER_RESTART;
    }

      static void watchdog_interrupt_count(void)
      {
          __this_cpu_inc(hrtimer_interrupts);
      }

    static int is_softlockup(unsigned long touch_ts)
    {
        unsigned long now = get_timestamp();
    
        if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
            /* Warn about unreasonable delays. */
            if (time_after(now, touch_ts + get_softlockup_thresh()))
                return now - touch_ts;
        }
        return 0;
    }

    2. 对watchdog的设置

     对watchdog行为的设置有两个途径:通过命令行传入参数和通过proc设置。

    2.1 通过命令行设置

    通过命令行传入参数,可以对soft lockup进行开关设置、超时过后是否panic等等行为。

    static int __init softlockup_panic_setup(char *str)
    {
        softlockup_panic = simple_strtoul(str, NULL, 0);
    
        return 1;
    }
    __setup("softlockup_panic=", softlockup_panic_setup);
    
    static int __init nowatchdog_setup(char *str)
    {
        watchdog_enabled = 0;
        return 1;
    }
    __setup("nowatchdog", nowatchdog_setup);
    
    static int __init nosoftlockup_setup(char *str)
    {
        watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
        return 1;
    }
    __setup("nosoftlockup", nosoftlockup_setup);
    
    #ifdef CONFIG_SMP
    static int __init softlockup_all_cpu_backtrace_setup(char *str)
    {
        sysctl_softlockup_all_cpu_backtrace =
            !!simple_strtol(str, NULL, 0);
        return 1;
    }
    __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
    static int __init hardlockup_all_cpu_backtrace_setup(char *str)
    {
        sysctl_hardlockup_all_cpu_backtrace =
            !!simple_strtol(str, NULL, 0);
        return 1;
    }
    __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
    #endif

    2.2 通过proc节点调节watchdog

     watchdog相关的配置还可以通过proc文件系统进行配置。

    /proc/sys/kernel/nmi_watchdog-------------------------hard lockup开关,proc_nmi_watchdog()。
    /proc/sys/kernel/soft_watchdog------------------------soft lockup开关,proc_soft_watchdog()。
    /proc/sys/kernel/watchdog-----------------------------watchdog总开关,proc_watchdog()。
    /proc/sys/kernel/watchdog_cpumask---------------------watchdog cpumask,proc_watchdog_cpumask()。
    /proc/sys/kernel/watchdog_thresh----------------------watchdog超时阈值设置,proc_watchdog_thresh()。

    3. 定位soft lockup异常

    引起soft lockup的原因一般是死循环或者死锁, 死循环可以通过栈回溯找到问题点;死锁问题需要打开内核的lockdep功能。

    打开内核的lockdep功能可以参考《Linux死锁检测-Lockdep》。

    下面看一个while(1)引起的soft lockup异常分析:

    [ 5656.032325] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [cat:157]-----------------------CPU、进程等信息粗略定位。
    [ 5656.039314] Modules linked in:
    [ 5656.042386] 
    [ 5656.042386] CURRENT PROCESS:
    [ 5656.042386] 
    [ 5656.048229] COMM=cat PID=157
    [ 5656.051117] TEXT=00008000-000c5a68 DATA=000c6f1c-000c7175 BSS=000c7175-000c8000
    [ 5656.058432] USER-STACK=7fc1ee50  KERNEL-STACK=bd0b7080
    [ 5656.058432] 
    [ 5656.065069] PC: 0x8032a1b2 (clk_summary_show+0x62/0xb4)--------------------------------------------PC指向出问题的点,更加精确的定位。
    [ 5656.070302] LR: 0x8032a186 (clk_summary_show+0x36/0xb4)
    [ 5656.075531] SP: 0xbd8b1b74...
    [ 5656.217622] 
    Call Trace:-----------------------------------------------------------------------------------------通过Call Trace,可以了解是如何走到PC指向的问题点的。来龙去脉一目了然。
    [<80155c5e>] seq_read+0xc2/0x46c
    [<802826ac>] full_proxy_read+0x58/0x98
    [<8013239c>] do_readv_writev+0x31c/0x384
    [<80132458>] vfs_readv+0x54/0x8c
    [<80160b52>] default_file_splice_read+0x166/0x2b0
    [<801606ee>] do_splice_to+0x76/0xb0
    [<801607de>] splice_direct_to_actor+0xb6/0x21c
    [<801609c2>] do_splice_direct+0x7e/0xa8
    [<80132a5a>] do_sendfile+0x21a/0x45c
    [<80133776>] SyS_sendfile64+0xf6/0xfc
    [<80046186>] csky_systemcall+0x96/0xe0
  • 相关阅读:
    LAMP网站架构解释
    ftp--pureftpd1.0.46
    给远程主机起别名
    ssh修改端口号并进行远程访问
    ssh使两台机器建立连接
    Linux搭建svn服务
    centos上git搭建
    centos上Jenkins搭建
    kvm安装准备
    服务器Java环境配置
  • 原文地址:https://www.cnblogs.com/arnoldlu/p/10338850.html
Copyright © 2011-2022 走看看