zoukankan      html  css  js  c++  java
  • Linux soft lockup分析

    关键词:watchdog、soft lockup、percpu thread、lockdep等。

    近日遇到一个soft lockup问题,打印类似“[ 56.032356] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [cat:153]”。

    这是lockup检测机制在起作用,lockup检测机制包括soft lockup detector和hard lockup detector。

    借机分析下soft lockup机制以及什么情况下导致soft watchdog异常、对watchdog的配置、如何定位异常点。

    这里跳过hard lockup detector的分析。

    1. soft lockup机制分析

    lockup_detector_init()函数首先获取sample_period以及watchdog_cpumask,然后根据情况创建线程,启动喂狗程序;创建hrtimer启动看门狗。

    这里有两个重点:一个是创建内核线程的API,另一个是struct smp_hotplug_thread结构体。

    void __init lockup_detector_init(void)
    {
        set_sample_period();----------------------------------------获取变量sample_period,为watchdog_thresh*2/5,即4秒喂一次狗。
    ...
        cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
    
        if (watchdog_enabled)
            watchdog_enable_all_cpus();
    }
    
    static int watchdog_enable_all_cpus(void)
    {
        int err = 0;
    
        if (!watchdog_running) {----------------------------------如果当前watchdog没有在运行(watchdog_running为0),那么为每个online CPU创建一个watchdog/x线程,这些线程每隔sample_period时间喂一次狗。watchdog_threads是创建watchdog/x线程的主要输入参数,watchdog_cpumask规定了哪些CPU上的线程会被unpark并真正运行。
            err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
                                     &watchdog_cpumask);
            if (err)
            pr_err("Failed to create watchdog threads, disabled\n");
            else
                watchdog_running = 1;
        } else {
            err = update_watchdog_all_cpus();
    
            if (err) {
                watchdog_disable_all_cpus();
            pr_err("Failed to update lockup detectors, disabled\n");
            }
        }
    
        if (err)
            watchdog_enabled = 0;
    
        return err;
    }
    
    /*
     * Tear down the watchdog threads if they are currently registered.
     *
     * Does nothing when watchdog_running is already 0. Otherwise the flag is
     * cleared first and watchdog_threads is then unregistered through
     * smpboot_unregister_percpu_thread().
     */
    static void watchdog_disable_all_cpus(void)
    {
        if (watchdog_running) {
            watchdog_running = 0;
            smpboot_unregister_percpu_thread(&watchdog_threads);
        }
    }
    
    /*
     * Cycle every watchdog thread through park and then unpark.
     *
     * Returns 0 on success. If watchdog_park_threads() fails, its error code
     * is returned immediately and watchdog_unpark_threads() is not called
     * from here.
     */
    static int update_watchdog_all_cpus(void)
    {
        int ret;
    
        ret = watchdog_park_threads();
        if (ret)
            return ret;
    
        watchdog_unpark_threads();
    
        return 0;
    }
    
    static int watchdog_park_threads(void)
    {
        int cpu, ret = 0;
    
        atomic_set(&watchdog_park_in_progress, 1);
    
        for_each_watchdog_cpu(cpu) {
            ret = kthread_park(per_cpu(softlockup_watchdog, cpu));---------------------------设置struct kthread->flags的KTHREAD_SHOULD_PARK位,watchdog/x线程检测到该标志后会调用park成员函数(即watchdog_disable())进行处理。
            if (ret)
                break;
        }
    
        atomic_set(&watchdog_park_in_progress, 0);
    
        return ret;
    }
    
    static void watchdog_unpark_threads(void)
    {
        int cpu;
    
        for_each_watchdog_cpu(cpu)
            kthread_unpark(per_cpu(softlockup_watchdog, cpu));-------------------------------清空struct kthread->flags的KTHREAD_SHOULD_PARK位,watchdog/x线程恢复运行后会调用unpark成员函数(即watchdog_enable())。
    }

    1.1 watchdog_threads结构体介绍

    在介绍如何创建watchdog/x线程之前,有必要先介绍一下struct smp_hotplug_thread结构体。

    struct smp_hotplug_thread {
        struct task_struct __percpu    **store;--------------------------存放percpu struct task_struct指针的指针。
        struct list_head        list;
        int                (*thread_should_run)(unsigned int cpu);-------检查是否应该运行watchdog/x线程。
        void                (*thread_fn)(unsigned int cpu);--------------watchdog/x线程的主函数。
        void                (*create)(unsigned int cpu);
        void                (*setup)(unsigned int cpu);------------------在运行watchdog/x线程之前的准备工作。
        void                (*cleanup)(unsigned int cpu, bool online);---在退出watchdog/x线程时的清理工作。
        void                (*park)(unsigned int cpu);-------------------当CPU offline时,需要临时停止。
        void                (*unpark)(unsigned int cpu);-----------------当CPU变成online时,进行准备工作。
        cpumask_var_t            cpumask;--------------------------------记录允许在哪些CPU上unpark(运行)该线程。
        bool                selfparking;
        const char            *thread_comm;------------------------------watchdog/x线程名称。
    };

     watchdog_threads是soft lockup监控线程的实体,基于此创建 watchdog/x线程。

    /*
     * Descriptor of the per-CPU soft-lockup threads; this is the object handed
     * to smpboot_register_percpu_thread_cpumask() and
     * smpboot_unregister_percpu_thread().
     *
     * Note that .setup and .unpark share watchdog_enable(), and that .park is
     * watchdog_disable() while .cleanup is watchdog_cleanup().
     */
    static struct smp_hotplug_thread watchdog_threads = {
        .store            = &softlockup_watchdog,	/* per-CPU slot holding each thread's task_struct pointer */
        .thread_should_run    = watchdog_should_run,
        .thread_fn        = watchdog,
        .thread_comm        = "watchdog/%u",	/* thread name template; presumably %u is the CPU number */
        .setup            = watchdog_enable,
        .cleanup        = watchdog_cleanup,
        .park            = watchdog_disable,
        .unpark            = watchdog_enable,
    };
    
    static void watchdog_enable(unsigned int cpu)
    {
        struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
    
        /* kick off the timer for the hardlockup detector */
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = watchdog_timer_fn;------------------------------------------创建一个hrtimer,超时函数为watchdog_timer_fn,这里面会检查watchdog_touch_ts变量是否超过20秒没有被更新。如果是,则有soft lockup。
    
        /* Enable the perf event */
        watchdog_nmi_enable(cpu);
    
        /* done here because hrtimer_start can only pin to smp_processor_id() */
        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
                  HRTIMER_MODE_REL_PINNED);---------------------------------------------启动一个超时为sample_period(4秒)的hrtimer,HRTIMER_MODE_REL_PINNED表示此hrtimer和当前CPU绑定。
    
        /* initialize timestamp */
        watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);---------------------------------设置当前线程为实时FIFO调度,优先级为99(MAX_RT_PRIO - 1)。这是实时优先级中最高的一级,自然也高于所有的非实时线程。
        __touch_watchdog();-------------------------------------------------------------更新watchdog_touch_ts变量,相当于喂狗操作。
    }
    
    /*
     * Apply a scheduling policy and priority to the calling task.
     *
     * @policy: scheduling policy passed through to sched_setscheduler()
     * @prio:   value stored in sched_param.sched_priority
     *
     * The change is made on `current`; the return value of
     * sched_setscheduler() is not checked.
     */
    static void watchdog_set_prio(unsigned int policy, unsigned int prio)
    {
        struct sched_param param = { .sched_priority = prio };
    
        sched_setscheduler(current, policy, &param);
    }
    
    /* Commands for resetting the watchdog */
    static void __touch_watchdog(void)
    {
        __this_cpu_write(watchdog_touch_ts, get_timestamp());----------------------------喂狗的操作就是更新watchdog_touch_ts变量,也即当前时间戳。
    }
    
    
    static void watchdog_disable(unsigned int cpu)-------------------------------------相当于watchdog_enable()反操作,将线程恢复为普通线程;取消hrtimer。
    {
        struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
    
        watchdog_set_prio(SCHED_NORMAL, 0);
        hrtimer_cancel(hrtimer);
        /* disable the perf event */
        watchdog_nmi_disable(cpu);
    }
    
    /*
     * Cleanup callback of watchdog_threads: forwards to watchdog_disable()
     * for the given CPU. The `online` argument is not used.
     */
    static void watchdog_cleanup(unsigned int cpu, bool online)
    {
        watchdog_disable(cpu);
    }
    
    static int watchdog_should_run(unsigned int cpu)
    {
        return __this_cpu_read(hrtimer_interrupts) !=
            __this_cpu_read(soft_lockup_hrtimer_cnt);------------------------------------hrtimer_interrupts记录了产生hrtimer的次数;在watchdog()中,将hrtimer_interrupts赋给soft_lockup_hrtimer_cnt。两者相等表示没有hrtimer产生,不需要运行watchdog/x线程;相反不等,则需要watchdog/x线程运行。
    }
    static void watchdog(unsigned int cpu)
    {
        __this_cpu_write(soft_lockup_hrtimer_cnt,
                 __this_cpu_read(hrtimer_interrupts));-----------------------------------更新soft_lockup_hrtimer_cnt之后,watchdog_should_run()就会返回false,表示在下一次hrtimer到期之前线程不需要运行,即不需要喂狗。
        __touch_watchdog();--------------------------------------------------------------虽然就是一句话,但是却很重要的喂狗操作。
    
        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
            watchdog_nmi_disable(cpu);
    }

    1.2 创建喂狗线程watchdog/x

    在分析了watchdog_threads之后,再来看看如何创建watchdog/x线程。 

    int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
                           const struct cpumask *cpumask)
    {
        unsigned int cpu;
        int ret = 0;
    
        if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
            return -ENOMEM;
        cpumask_copy(plug_thread->cpumask, cpumask);
    
        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
        for_each_online_cpu(cpu) {------------------------------------------------遍历所有online CPU,为每个CPU创建一个percpu的watchdog/x线程。
            ret = __smpboot_create_thread(plug_thread, cpu);
            if (ret) {
                smpboot_destroy_threads(plug_thread);-----------------------------创建失败则释放相关资源。
                free_cpumask_var(plug_thread->cpumask);
                goto out;
            }
            if (cpumask_test_cpu(cpu, cpumask))
                smpboot_unpark_thread(plug_thread, cpu);--------------------------如果当前CPU在cpumask中,则清空KTHREAD_SHOULD_PARK让线程运行。线程首次运行时调用watchdog_threads的setup成员函数,之后从parked状态恢复时调用unpark成员函数,二者都是watchdog_enable()。
        }
        list_add(&plug_thread->list, &hotplug_threads);
    out:
        mutex_unlock(&smpboot_threads_lock);
        put_online_cpus();
        return ret;
    }
    
    static int
    __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
    {
        struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
        struct smpboot_thread_data *td;
    
        if (tsk)
            return 0;
    
        td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
        if (!td)
            return -ENOMEM;
        td->cpu = cpu;
        td->ht = ht;
    
        tsk =kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
                        ht->thread_comm);-----------------------------------------在指定CPU上创建watchdog/x线程,处理函数为smpboot_thread_fn()。
        if (IS_ERR(tsk)) {
            kfree(td);
            return PTR_ERR(tsk);
        }
        /*
         * Park the thread so that it could start right on the CPU
         * when it is available.
         */
        kthread_park(tsk);--------------------------------------------------------先将新建的watchdog/x线程置为parked状态,等对应CPU可用并被unpark之后,线程才在该CPU上开始运行。
        get_task_struct(tsk);-----------------------------------------------------增加对线程的引用计数。
        *per_cpu_ptr(ht->store, cpu) = tsk;---------------------------------------将线程的task_struct指针保存到store所指向的percpu变量中。
        if (ht->create) {
            if (!wait_task_inactive(tsk, TASK_PARKED))
                WARN_ON(1);
            else
                ht->create(cpu);
        }
        return 0;
    }
    
    static int smpboot_thread_fn(void *data)
    {
        struct smpboot_thread_data *td = data;
        struct smp_hotplug_thread *ht = td->ht;
    
        while (1) {
            set_current_state(TASK_INTERRUPTIBLE);
            preempt_disable();
            if (kthread_should_stop()) {----------------------------------------如果可以终止线程,调用cleanup,退出线程。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                /* cleanup must mirror setup */
                if (ht->cleanup && td->status != HP_THREAD_NONE)
                    ht->cleanup(td->cpu, cpu_online(td->cpu));
                kfree(td);
                return 0;
            }
    
            if (kthread_should_park()) {----------------------------------------如果KTHREAD_SHOULD_PARK置位,调用park()暂停进程执行。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                if (ht->park && td->status == HP_THREAD_ACTIVE) {
                    BUG_ON(td->cpu != smp_processor_id());
                    ht->park(td->cpu);
                    td->status = HP_THREAD_PARKED;
                }
                kthread_parkme();
                /* We might have been woken for stop */
                continue;
            }
    
            BUG_ON(td->cpu != smp_processor_id());
    
            /* Check for state change setup */
            switch (td->status) {
            case HP_THREAD_NONE:-----------------------------------------------相当于第一次运行,调用setup()进行初始化操作。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                if (ht->setup)
                    ht->setup(td->cpu);
                td->status = HP_THREAD_ACTIVE;
                continue;
    
            case HP_THREAD_PARKED:---------------------------------------------从parked状态恢复。
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                if (ht->unpark)
                    ht->unpark(td->cpu);
                td->status = HP_THREAD_ACTIVE;
                continue;
            }
    
            if (!ht->thread_should_run(td->cpu)) {-----------------------------如果不需要进程运行,schedule()主动放弃CPU给其他线程使用。
                preempt_enable_no_resched();
                schedule();
            } else {
                __set_current_state(TASK_RUNNING);
                preempt_enable();
                ht->thread_fn(td->cpu);----------------------------------------调用struct smp_hotplug_thread->thread_fn,即watchdog(),进行喂狗操作。
            }
        }
    }
    
    void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)----将创建的内核线程移除操作。
    {
        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
        list_del(&plug_thread->list);
        smpboot_destroy_threads(plug_thread);
        mutex_unlock(&smpboot_threads_lock);
        put_online_cpus();
        free_cpumask_var(plug_thread->cpumask);
    }
    
    /*
     * Stop and release the per-CPU threads that belong to @ht.
     *
     * Walks every possible CPU (not only the online ones). For each CPU whose
     * slot in ht->store is non-NULL, the thread is stopped with
     * kthread_stop(), the task reference is dropped with put_task_struct()
     * (the counterpart of the get_task_struct() taken when the thread was
     * created), and the slot is reset to NULL.
     */
    static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
    {
        unsigned int cpu;
    
        /* We need to destroy also the parked threads of offline cpus */
        for_each_possible_cpu(cpu) {
            struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
    
            if (tsk) {
                kthread_stop(tsk);
                put_task_struct(tsk);
                *per_cpu_ptr(ht->store, cpu) = NULL;
            }
        }
    }

    1.3 hrtimer看门狗

     在分析了喂狗线程watchdog/x之后,再来分析看门狗是如何实现的?

    看门狗是通过启动一个周期为4秒的hrtimer来实现的,这个hrtimer和CPU绑定,使用的变量都是percpu的。确保每个CPU之间不相互干扰。

    每次hrtimer超时,都会唤醒watchdog/x线程,并进行一次喂狗操作。

    因为hrtimer超时函数在硬中断上下文中执行,只要中断没有被关闭,它就会比线程优先得到执行。

    所以在watchdog/x线程没有得到执行的情况下,通过is_softlockup()来判断看门狗是否超过20秒没有得到喂狗。

    static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
    {
        unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
        struct pt_regs *regs = get_irq_regs();
        int duration;
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
    
        if (atomic_read(&watchdog_park_in_progress) != 0)
            return HRTIMER_NORESTART;
    
        /* kick the hardlockup detector */
        watchdog_interrupt_count();------------------------------------------------------------------每产生一次中断,hrtimer_interrupts计数加1。hrtimer_interrupts记录了产生hrtimer的次数。
    
        /* kick the softlockup detector */
        wake_up_process(__this_cpu_read(softlockup_watchdog));---------------------------------------唤醒watchdog/x线程,进行喂狗操作。
    
        /* .. and repeat */
        hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));------------------------------------重新设置超时点,形成周期性时钟。
    ...
        duration = is_softlockup(touch_ts);----------------------------------------------------------返回非0表示,看门狗超时。
        if (unlikely(duration)) {--------------------------------------------------------------------看门狗超时情况的处理。
            if (kvm_check_and_clear_guest_paused())
                return HRTIMER_RESTART;
    
            /* only warn once */
            if (__this_cpu_read(soft_watchdog_warn) == true) {
                if (__this_cpu_read(softlockup_task_ptr_saved) !=
                    current) {
                    __this_cpu_write(soft_watchdog_warn, false);
                    __touch_watchdog();
                }
                return HRTIMER_RESTART;
            }
    
            if (softlockup_all_cpu_backtrace) {
                if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
                    /* Someone else will report us. Let's give up */
                    __this_cpu_write(soft_watchdog_warn, true);
                    return HRTIMER_RESTART;
                }
            }
    
            pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                smp_processor_id(), duration,
                current->comm, task_pid_nr(current));-------------------------------------------------打印哪个CPU被卡死duration秒,以及死在哪个进程。
            __this_cpu_write(softlockup_task_ptr_saved, current);
            print_modules();
            print_irqtrace_events(current);-----------------------------------------------------------显示开关中断、软中断信息,禁止中断和软中断也是造成soft lockup的一个原因。
            if (regs)---------------------------------------------------------------------------------有寄存器显示寄存器信息,同时显示栈信息。
                show_regs(regs);
            else
                dump_stack();
    
            if (softlockup_all_cpu_backtrace) {
                trigger_allbutself_cpu_backtrace();
    
                clear_bit(0, &soft_lockup_nmi_warn);
                /* Barrier to sync with other cpus */
                smp_mb__after_atomic();
            }
    
            add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
            if (softlockup_panic)---------------------------------------------------------------------如果softlockup_panic非0(可通过命令行参数softlockup_panic=设置),则进入panic()。
                panic("softlockup: hung tasks");
            __this_cpu_write(soft_watchdog_warn, true);
        } else
            __this_cpu_write(soft_watchdog_warn, false);
    
        return HRTIMER_RESTART;
    }

      /*
       * Increment this CPU's hrtimer_interrupts counter by one.
       * watchdog_should_run() compares this counter with
       * soft_lockup_hrtimer_cnt to decide whether the watchdog thread has
       * work to do.
       */
      static void watchdog_interrupt_count(void)
      {
          __this_cpu_inc(hrtimer_interrupts);
      }

    /*
     * Decide whether the soft-lockup threshold has been exceeded.
     *
     * @touch_ts: timestamp of the last watchdog touch on this CPU
     *
     * Returns 0 when the soft watchdog is disabled (SOFT_WATCHDOG_ENABLED bit
     * clear in watchdog_enabled), when watchdog_thresh is 0, or when the
     * threshold has not been passed yet. Otherwise returns the time elapsed
     * since @touch_ts, in get_timestamp() units (the caller prints it as
     * seconds).
     */
    static int is_softlockup(unsigned long touch_ts)
    {
        unsigned long now = get_timestamp();
    
        if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
            /* Warn about unreasonable delays. */
            if (time_after(now, touch_ts + get_softlockup_thresh()))
                return now - touch_ts;
        }
        return 0;
    }

    2. 对watchdog的设置

     对watchdog行为的设置有两个途径:通过命令行传入参数和通过proc设置。

    2.1 通过命令行设置

    通过命令行传入参数,可以对soft lockup进行开关设置、超时过后是否panic等等行为。

    /*
     * Handler for the "softlockup_panic=" kernel command-line parameter.
     *
     * Parses @str with simple_strtoul() (base argument 0) and stores the
     * result in softlockup_panic. Always returns 1.
     */
    static int __init softlockup_panic_setup(char *str)
    {
        softlockup_panic = simple_strtoul(str, NULL, 0);
    
        return 1;
    }
    __setup("softlockup_panic=", softlockup_panic_setup);
    
    /*
     * Handler for the "nowatchdog" kernel command-line parameter.
     *
     * Clears watchdog_enabled entirely (all bits set to 0); @str is unused.
     * Always returns 1.
     */
    static int __init nowatchdog_setup(char *str)
    {
        watchdog_enabled = 0;
        return 1;
    }
    __setup("nowatchdog", nowatchdog_setup);
    
    /*
     * Handler for the "nosoftlockup" kernel command-line parameter.
     *
     * Clears only the SOFT_WATCHDOG_ENABLED bit of watchdog_enabled and
     * leaves the other bits untouched; @str is unused. Always returns 1.
     */
    static int __init nosoftlockup_setup(char *str)
    {
        watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
        return 1;
    }
    __setup("nosoftlockup", nosoftlockup_setup);
    
    #ifdef CONFIG_SMP
    /*
     * Handler for the "softlockup_all_cpu_backtrace=" command-line parameter.
     *
     * Parses @str with simple_strtol() (base argument 0) and normalises the
     * result to 0 or 1 with `!!` before storing it in
     * sysctl_softlockup_all_cpu_backtrace. Always returns 1.
     */
    static int __init softlockup_all_cpu_backtrace_setup(char *str)
    {
        sysctl_softlockup_all_cpu_backtrace =
            !!simple_strtol(str, NULL, 0);
        return 1;
    }
    __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
    /*
     * Handler for the "hardlockup_all_cpu_backtrace=" command-line parameter.
     *
     * Parses @str with simple_strtol() (base argument 0) and normalises the
     * result to 0 or 1 with `!!` before storing it in
     * sysctl_hardlockup_all_cpu_backtrace. Always returns 1.
     */
    static int __init hardlockup_all_cpu_backtrace_setup(char *str)
    {
        sysctl_hardlockup_all_cpu_backtrace =
            !!simple_strtol(str, NULL, 0);
        return 1;
    }
    __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
    #endif

    2.2 通过proc节点调节watchdog

     watchdog相关的配置还可以通过proc文件系统进行配置。

    /proc/sys/kernel/nmi_watchdog-------------------------hard lockup开关,proc_nmi_watchdog()。
    /proc/sys/kernel/soft_watchdog------------------------soft lockup开关,proc_soft_watchdog()。
    /proc/sys/kernel/watchdog-----------------------------watchdog总开关,proc_watchdog()。
    /proc/sys/kernel/watchdog_cpumask---------------------watchdog cpumask,proc_watchdog_cpumask()。
    /proc/sys/kernel/watchdog_thresh----------------------watchdog超时阈值设置,proc_watchdog_thresh()。

    3. 定位soft lockup异常

    引起soft lockup的原因一般是死循环或者死锁, 死循环可以通过栈回溯找到问题点;死锁问题需要打开内核的lockdep功能。

    打开内核的lockdep功能可以参考《Linux死锁检测-Lockdep》。

    下面看一个while(1)引起的soft lockup异常分析:

    [ 5656.032325] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [cat:157]-----------------------CPU、进程等信息粗略定位。
    [ 5656.039314] Modules linked in:
    [ 5656.042386] 
    [ 5656.042386] CURRENT PROCESS:
    [ 5656.042386] 
    [ 5656.048229] COMM=cat PID=157
    [ 5656.051117] TEXT=00008000-000c5a68 DATA=000c6f1c-000c7175 BSS=000c7175-000c8000
    [ 5656.058432] USER-STACK=7fc1ee50  KERNEL-STACK=bd0b7080
    [ 5656.058432] 
    [ 5656.065069] PC: 0x8032a1b2 (clk_summary_show+0x62/0xb4)--------------------------------------------PC指向出问题的点,更加精确的定位。
    [ 5656.070302] LR: 0x8032a186 (clk_summary_show+0x36/0xb4)
    [ 5656.075531] SP: 0xbd8b1b74...
    [ 5656.217622] 
    Call Trace:-----------------------------------------------------------------------------------------通过Call Trace,可以了解是如何一步步走到PC指向的问题点的,来龙去脉一目了然。
    [<80155c5e>] seq_read+0xc2/0x46c
    [<802826ac>] full_proxy_read+0x58/0x98
    [<8013239c>] do_readv_writev+0x31c/0x384
    [<80132458>] vfs_readv+0x54/0x8c
    [<80160b52>] default_file_splice_read+0x166/0x2b0
    [<801606ee>] do_splice_to+0x76/0xb0
    [<801607de>] splice_direct_to_actor+0xb6/0x21c
    [<801609c2>] do_splice_direct+0x7e/0xa8
    [<80132a5a>] do_sendfile+0x21a/0x45c
    [<80133776>] SyS_sendfile64+0xf6/0xfc
    [<80046186>] csky_systemcall+0x96/0xe0
  • 相关阅读:
    IO 单个文件的多线程拷贝
    day30 进程 同步 异步 阻塞 非阻塞 并发 并行 创建进程 守护进程 僵尸进程与孤儿进程 互斥锁
    day31 进程间通讯,线程
    d29天 上传电影练习 UDP使用 ScketServer模块
    d28 scoket套接字 struct模块
    d27网络编程
    d24 反射,元类
    d23 多态,oop中常用的内置函数 类中常用内置函数
    d22 封装 property装饰器 接口 抽象类 鸭子类型
    d21天 继承
  • 原文地址:https://www.cnblogs.com/arnoldlu/p/10338850.html
Copyright © 2011-2022 走看看