zoukankan      html  css  js  c++  java
  • 高通sensor库和Linker的死锁问题分析报告

    【问题描述】

    调试NativeHeap泄露时,我们会用到Android Native Heap调试框架。

    在push libc_malloc_debug_leak.so后重启zygote(adb shell stop + adb shell start),会发现系统一直起不来。

    用debuggerd打印system_server的调用栈,可以发现system server的大部分线程都在malloc函数里卡死:

    pid: 4918, tid: 4929, name: Binder_1  >>> system_server <<<
    backtrace:
        #00 pc 00000000000178cc  /system/bin/linker64 (__dl_syscall+28)
        #01 pc 00000000000160c0  /system/bin/linker64 (__dl__ZL33__pthread_mutex_lock_with_timeoutP24pthread_mutex_internal_tPK8timespeci.constprop.0+260)
        #02 pc 00000000000163cc  /system/bin/linker64 (__dl_pthread_mutex_lock+36)
        #03 pc 0000000000003318  /system/bin/linker64 (__dl_dl_iterate_phdr+32)
        #04 pc 000000000003ca94  /system/lib64/libc_malloc_debug_leak.so (_Unwind_Find_FDE+368)
        #05 pc 0000000000039dd4  /system/lib64/libc_malloc_debug_leak.so
        #06 pc 000000000003a998  /system/lib64/libc_malloc_debug_leak.so
        #07 pc 000000000003b1fc  /system/lib64/libc_malloc_debug_leak.so (_Unwind_Backtrace+76)
        #08 pc 0000000000008808  /system/lib64/libc_malloc_debug_leak.so
        #09 pc 0000000000009e10  /system/lib64/libc_malloc_debug_leak.so (leak_malloc+404)
        #10 pc 000000000001bb9c  /system/lib64/libc.so (malloc+20)
        #11 pc 0000000000012770  /system/lib64/libutils.so (_ZN7android12SharedBuffer5allocEm+56)
        #12 pc 0000000000014108  /system/lib64/libutils.so (_ZN7android8String16C1EPKDsm+32)
        #13 pc 00000000000d356c  /system/lib64/libandroid_runtime.so
        #14 pc 0000000073b6286c  /data/dalvik-cache/arm64/system@framework@boot.oat (offset 0x248a000)

    其中有两个线程发生死锁:

    pid: 4918, tid: 4992, name: system_server  >>> system_server <<<
    backtrace:
        #00 pc 000000000001bf9c  /system/lib64/libc.so (syscall+28)
        #01 pc 0000000000066bd4  /system/lib64/libc.so (_ZL33__pthread_mutex_lock_with_timeoutP24pthread_mutex_internal_tPK8timespeci.constprop.0+260)
        #02 pc 0000000000066f5c  /system/lib64/libc.so (pthread_mutex_lock+36)
        #03 pc 000000000000a414  /system/vendor/lib64/libsensor1.so 
        #04 pc 000000000000c774  /system/vendor/lib64/libsensor1.so (sensor1_open+1388)
        #05 pc 000000000002c6b0  /system/vendor/lib64/sensors.ssc.so (_ZN14SensorsContextC2Ev+252)
        #06 pc 0000000000013298  /system/vendor/lib64/sensors.ssc.so
        #07 pc 0000000000003c28  /system/bin/linker64 (__dl__ZN6soinfo13call_functionEPKcPFvvE+104)
        #08 pc 0000000000003d7c  /system/bin/linker64 (__dl__ZN6soinfo10call_arrayEPKcPPFvvEmb+248)
        #09 pc 000000000000a090  /system/bin/linker64 (__dl__Z9do_dlopenPKciPK17android_dlextinfo+436)
        #10 pc 00000000000033ac  /system/bin/linker64 (__dl_dlopen+44)
        #11 pc 0000000000006c70  /system/lib64/hw/sensors.msm8996.so (_ZL17lazy_init_modulesv.part.203+200)
        #12 pc 0000000000006f20  /system/lib64/hw/sensors.msm8996.so (_ZL12open_sensorsPK11hw_module_tPKcPP11hw_device_t+456)
        #13 pc 000000000000cdc0  /system/lib64/libsensorservice.so
        #14 pc 00000000000117bc  /system/lib64/libsensorservice.so
        #15 pc 0000000000012244  /system/lib64/libutils.so (_ZNK7android7RefBase9incStrongEPKv+112)
        #16 pc 000000000001fe90  /system/lib64/libandroid_servers.so (_ZN7android10sensorInitEPv+128)
        #17 pc 0000000000065ee4  /system/lib64/libc.so (_ZL15__pthread_startPv+52)
        #18 pc 000000000001ed44  /system/lib64/libc.so (__start_thread+16)
    
    pid: 4918, tid: 4993, name: system_server  >>> system_server <<<
    backtrace:
        #00 pc 00000000000178cc  /system/bin/linker64 (__dl_syscall+28)
        #01 pc 00000000000160c0  /system/bin/linker64 (__dl__ZL33__pthread_mutex_lock_with_timeoutP24pthread_mutex_internal_tPK8timespeci.constprop.0+260)
        #02 pc 00000000000163cc  /system/bin/linker64 (__dl_pthread_mutex_lock+36)
        #03 pc 0000000000003318  /system/bin/linker64 (__dl_dl_iterate_phdr+32) -linker
        #04 pc 000000000003ca94  /system/lib64/libc_malloc_debug_leak.so (_Unwind_Find_FDE+368)
        #05 pc 0000000000039dd4  /system/lib64/libc_malloc_debug_leak.so
        #06 pc 000000000003a998  /system/lib64/libc_malloc_debug_leak.so
        #07 pc 000000000003b1fc  /system/lib64/libc_malloc_debug_leak.so (_Unwind_Backtrace+76)
        #08 pc 0000000000008808  /system/lib64/libc_malloc_debug_leak.so
        #09 pc 0000000000009e10  /system/lib64/libc_malloc_debug_leak.so (leak_malloc+404)
        #10 pc 000000000001bb9c  /system/lib64/libc.so (malloc+20)
        #11 pc 000000000000bd80  /system/vendor/lib64/libsensor1.so
        #12 pc 0000000000065ee4  /system/lib64/libc.so (_ZL15__pthread_startPv+52)
        #13 pc 000000000001ed44  /system/lib64/libc.so (__start_thread+16)

    死锁过程如下:

    4992线程在加载sensor模块时,由于sensors.ssc.so中定义了全局静态对象SensorsContext SensorsContext::self;

    因此dlopen(4992#10)时,加载完sensors.ssc.so后就会调用SensorsContext的构造函数(4992#05)。

    这个构造函数会创建线程4993,并等待4993线程唤醒自己。

    而dlopen函数在整个加载过程中(包括调用so的构造函数期间)会一直持有全局锁g_dl_mutex:

    void* dlopen(const char* filename, int flags) {
      return dlopen_ext(filename, flags, nullptr);
    }
    
    static void* dlopen_ext(const char* filename, int flags, const android_dlextinfo* extinfo) {
      ScopedPthreadMutexLocker locker(&g_dl_mutex);
      soinfo* result = do_dlopen(filename, flags, extinfo);  
      return result;
    }

    4993线程在运行过程中,会获取一个sensor模块的全局锁libsensor_cli_data_mutex,然后再调用malloc()函数。

    static sensor1_error_e
    libsensor_read_socket( int fd )
    {
      ...
      pthread_mutex_lock( &libsensor_cli_data_mutex );
      if( (cli_idx = libsensor_get_client_by_fd( fd )) < 0 ) {
        pthread_mutex_unlock( &libsensor_cli_data_mutex );
        return SENSOR1_EBUFFER;
      }
      rx_msg_p = malloc( SENSOR_MAX_MSG_SIZE + sizeof(libsensor_ctl_read_s) -1 );
      ...

    由于打开了HeapLeak调试开关,所以malloc时,会记录当前的调用栈。

    而获取调用栈的unwind方法需要用到linker的dl_iterate_phdr()方法(4993#03):

    int dl_iterate_phdr(int (*cb)(dl_phdr_info* info, size_t size, void* data), void* data) {
      ScopedPthreadMutexLocker locker(&g_dl_mutex);
      return do_dl_iterate_phdr(cb, data);
    }

    这个方法会申请g_dl_mutex锁,而这个锁此时已经被4992线程拿着了,所以4993会卡在dl_iterate_phdr()中。

    此时4992线程还在等待4993线程:

    sensor1_error_e
    sensor1_open( sensor1_handle_s **hndl,
                  sensor1_notify_data_cb_t data_cbf,
                  intptr_t cb_data )
    {
      ...
      if( -1 == ( err = clock_gettime (CLOCK_REALTIME, &open_timeout ) ) ) {
        ...
      } else {
        open_timeout.tv_sec += 1;
        err = sem_timedwait( &open_sem, &open_timeout );   //这里等待线程4993
      }
      if( 0 != err && ETIMEDOUT == errno ) {
        libsensor_client_data_s cli_data;
        LOG_ERROR( "%s: Sem wait timed-out for socket %i", __func__, sockfd );
        libsensor_del_client( sockfd );
        ...

    sem_timedwait()在等待1秒后超时,会调用libsensor_del_client(),

    而这个函数又会申请此时被4993线程持有的libsensor_cli_data_mutex锁(对应4992#03处的pthread_mutex_lock)。下面以libsensor_add_client()为例,展示这把锁的加锁方式:

    static int
    libsensor_add_client( libsensor_client_data_s const *cli_data, bool is_wait_clnt )
    {
      ...
      pthread_mutex_lock( &libsensor_cli_data_mutex );

    这样就产生了死锁。

    这个问题看起来是必现的,而由于这两个锁又不是同一个模块的,所以不太好解决。

    似乎无论是HeapLeak机制还是Sensor代码,各自的处理都没有问题。

    由于HeapLeak是Debug机制,所以在HeapLeak中做出让步,更合理一些。

    【解决方案】

    1、获取调用栈时,不用unwind方法:

    在64位下,MTK对获取backtrace的方法做了优化,

    可以参考MTK版本里的@bionic/libc/bionic/debug_stacktrace.cpp中的get_backtrace()的实现。

    这个方法通过fp来查找lr,可以大大提高效率,且不会调用unwind,也不会有死锁了。

    2、调用so的构造函数前释放 g_dl_mutex锁:

    @bionic/linker/dlfcn.cpp
    /*static*/ pthread_mutex_t g_dl_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
    static void* dlopen_ext(const char* filename, int flags, const android_dlextinfo* extinfo) {
      //ScopedPthreadMutexLocker locker(&g_dl_mutex);
      soinfo* result = do_dlopen(filename, flags, extinfo);
      return result;
    }
    
    @bionic/linker/linker.cpp
    extern pthread_mutex_t g_dl_mutex;
    soinfo* do_dlopen(const char* name, int flags, const android_dlextinfo* extinfo) {
      ...
      soinfo* si = nullptr;
      {
           ScopedPthreadMutexLocker locker(&g_dl_mutex);
           si = find_library(name, flags, extinfo);
      }
      if (si != nullptr) {
        si->call_constructors();  //这里是so的构造函数
      }
      return si;
    }

     

  • 相关阅读:
    C#图形编程
    深入浅出话事件(上)
    .NET名称空间对应的类集
    Equals() 和运算符 == 的重写准则(C# 编程指南)
    Implement EventArgs
    CLS(公共语言规范)的CLSCompliant(跨语言调用)
    学习MSCOREE.dll是托管程序的入口点
    设计模式之原型模式代码示例
    The disk cannot be added to Cluster Shared Volumes because it does not have any suitable partitions
    RAID小结
  • 原文地址:https://www.cnblogs.com/YYPapa/p/6850532.html
Copyright © 2011-2022 走看看