zoukankan      html  css  js  c++  java
  • docker hung住问题排查

    背景:这个是之前遇到的老问题。

    # systemctl status lxcfs
    ● lxcfs.service - FUSE filesystem for LXC
    Loaded: loaded (/usr/lib/systemd/system/lxcfs.service; enabled; vendor preset: disabled)
    Active: activating (start-post) since Tue 2020-06-23 14:37:50 CST; 5min ago---这个是6月份的案例,其实4月份就出过一次,不过由于运维同学急于重启,我没看到现场。
    Docs: man:lxcfs(1)
    Process: 415455 ExecStopPost=/bin/sh -c if mount |grep "baymax/lxcfs"; then fusermount -u /var/lib/baymax/lxcfs; fi (code=exited, status=0/SUCCESS)
    Main PID: 415526 (lxcfs); : 415529 (lxcfs-remount-c)
    Tasks: 43
    Memory: 28.9M
    CGroup: /system.slice/lxcfs.service
    ├─415526 /usr/bin/lxcfs -o nonempty /var/lib/baymax/lxcfs/
    └─control
    ├─415529 /bin/sh /usr/bin/lxcfs-remount-containers
    ├─416923 /bin/sh /usr/bin/lxcfs-remount-containers
    └─419090 docker exec 1eb2f723b69e sh -c export f=/proc/cpuinfo && test -f /var/lib/baymax/lxcfs/$f && (umount $f; mount --bind...

    然后看下对应的runc:

    # ps -ef |grep -i runc |grep -v shim
    root 172169 138974 0 14:43 pts/2 00:00:00 grep --color -i runc
    root 420924 70170 0 14:37 ? 00:00:00 runc --root /var/run/docker/runtime-runc/moby --log /run/docker/containerd/daemon/io.containerd.runtime.v1.linux/moby/1eb2f723b69e2dba83bc490d3fab66922a13a4787be8bcb4cd486e97843ffef5/log.json --log-format json exec --process /tmp/runc-process904568476 --detach --pid-file /run/docker/containerd/daemon/io.containerd.runtime.v1.linux/moby/1eb2f723b69e2dba83bc490d3fab66922a13a4787be8bcb4cd486e97843ffef5/4dfeee72cd794ebec396fb8450f8944499cdde99d22054c950e5a80fb56f0968.pid 1eb2f723b69e2dba83bc490d3fab66922a13a4787be8bcb4cd486e97843ffef5
    root 423656 420924 0 14:37 ? 00:00:00 runc init

    然后看对应 423656  的堆栈详细信息:

    PID: 423656  TASK: ffffa0e872d56180  CPU: 28  COMMAND: "runc:[2:INIT]"
     #0 [ffffa13093eb3c78] __schedule at ffffffffb6969a72
        ffffa13093eb3c80: 0000000000000086 ffffa13093eb3fd8 
        ffffa13093eb3c90: ffffa13093eb3fd8 ffffa13093eb3fd8 
        ffffa13093eb3ca0: 000000000001ab80 ffffa0d4f6a12080 
        ffffa13093eb3cb0: 0000000000000046 0000000000000000 
        ffffa13093eb3cc0: ffffa13093eb3d38 00000000e3a7c164 
        ffffa13093eb3cd0: 0000000000000246 ffffa1032ad42000 
        ffffa13093eb3ce0: ffffa1032ad42028 0000000000000000 
        ffffa13093eb3cf0: 0000000000000001 0000000000000000 
        ffffa13093eb3d00: ffffa13093eb3d10 ffffffffb6969f19 
     #1 [ffffa13093eb3d08] schedule at ffffffffb6969f19
        ffffa13093eb3d10: ffffa13093eb3d60 ffffffffb644bd50 
     #2 [ffffa13093eb3d18] pipe_wait at ffffffffb644bd50
        ffffa13093eb3d20: 0000000000000000 ffffa0e872d56180 
        ffffa13093eb3d30: ffffffffb62c3f50 ffffa0f072d87108 
        ffffa13093eb3d40: ffffa1032ad42030 00000000e3a7c164 
        ffffa13093eb3d50: ffffa1032ad42000 0000000000000010 -----分析堆栈,pipe对应的pipe_inode_info指针(ffffa1032ad42000)压栈在此
        ffffa13093eb3d60: ffffa13093eb3de8 ffffffffb644bff9 
     #3 [ffffa13093eb3d68] pipe_write at ffffffffb644bff9
        ffffa13093eb3d70: ffffa1032ad42028 ffffa0e872d56180 
        ffffa13093eb3d80: ffffa13093eb3df8 0000000000000000 
        ffffa13093eb3d90: ffffa10224b2c000 ffffffff00000000 
        ffffa13093eb3da0: ffffa12f3d7df300 00000ff1d12d8867 
        ffffa13093eb3db0: 0000000000000000 00000000e3a7c164 
        ffffa13093eb3dc0: ffffa13093eb3f18 000000c000008bf0 
        ffffa13093eb3dd0: ffffa13093eb3f18 0000000000000010 
        ffffa13093eb3de0: 0000000000000000 ffffa13093eb3ec0 
        ffffa13093eb3df0: ffffffffb6441c13 
     #4 [ffffa13093eb3df0] do_sync_write at ffffffffb6441c13
        ffffa13093eb3df8: 000000c000008bf0 0000000000000010 
        ffffa13093eb3e08: 0000000000000001 ffffa12f3d7df300 
        ffffa13093eb3e18: 0000000000000000 0000000000000000 
        ffffa13093eb3e28: 0000000000000000 ffffa0e872d56180 
        ffffa13093eb3e38: 0000000000000000 0000000000000000 
        ffffa13093eb3e48: 0000000000000000 0000000000000000 
        ffffa13093eb3e58: 0000000000000010 0000000000000000 
        ffffa13093eb3e68: 0000000000000010 0000000000000000 
        ffffa13093eb3e78: 0000000000000000 0000000000000000 
        ffffa13093eb3e88: 0000000000000000 0000000000000000 
        ffffa13093eb3e98: 0000000000000000 0000000000000000 
        ffffa13093eb3ea8: 0000000000000000 00000000e3a7c164 
        ffffa13093eb3eb8: ffffa12f3d7df300 ffffa13093eb3f00 
        ffffa13093eb3ec8: ffffffffb6442700 
     #5 [ffffa13093eb3ec8] vfs_write at ffffffffb6442700
        ffffa13093eb3ed0: 0000000000000000 ffffa12f3d7df300 
        ffffa13093eb3ee0: 0000000000000000 000000c000008bf0 
        ffffa13093eb3ef0: 0000000000000010 0000000000000000 
        ffffa13093eb3f00: ffffa13093eb3f48 ffffffffb644351f 
     #6 [ffffa13093eb3f08] sys_write at ffffffffb644351f
        ffffa13093eb3f10: ffffa13093eb3f48 0000000000000000 
        ffffa13093eb3f20: 00000000e3a7c164 0000000000000000 
        ffffa13093eb3f30: 0000000000000000 0000000000000000 
        ffffa13093eb3f40: 0000000000000000 0000000000000000 
        ffffa13093eb3f50: ffffffffb6976ddb 
     #7 [ffffa13093eb3f50] system_call_fastpath at ffffffffb6976ddb
        RIP: 000000000045b8a5  RSP: 000000c000008be8  RFLAGS: 00010206
        RAX: 0000000000000001  RBX: 0000000000000000  RCX: 000000c000000000
        RDX: 0000000000000010  RSI: 000000c000008bf0  RDI: 0000000000000002
        RBP: 000000c000008b90   R8: 0000000000000001   R9: 00000000006c0fab
        R10: 0000000000000000  R11: 0000000000000202  R12: 0000000000000000
        R13: 0000000000000000  R14: 000000000086d0d8  R15: 0000000000000000
        ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b

    看情况是卡在pipe的write,然后看下它打开的文件,找到对应的inode信息:

    PID: 423656  TASK: ffffa0e872d56180  CPU: 28  COMMAND: "runc:[2:INIT]"
    ROOT: /rootfs    CWD: /rootfs
     FD       FILE            DENTRY           INODE       TYPE PATH
      0 ffffa0feca33cf00 ffffa1333b800240 ffffa1333f568850 CHR  /dev/null
      1 ffffa1031adba700 ffffa0f10efb83c0 ffffa0d78de55c80 FIFO 
      2 ffffa12f3d7df300 ffffa0f10efb8a80 ffffa0d78de56f00 FIFO -----对应那个pipe
      3 ffffa133026b9700 ffffa11382e2f080 ffffa12a2b07ad30 SOCK UNIX

    验证下这个pipe:

    crash> struct file.private_data ffffa12f3d7df300
      private_data = 0xffffa1032ad42000
    crash> pipe_inode_info 0xffffa1032ad42000--------和上面的堆栈对得上
    struct pipe_inode_info {
      mutex = {
        count = {
          counter = 1
        }, 
        wait_lock = {
          {
            rlock = {
              raw_lock = {
                val = {
                  counter = 0
                }
              }
            }
          }
        }, 
        wait_list = {
          next = 0xffffa1032ad42008, 
          prev = 0xffffa1032ad42008
        }, 
        owner = 0x0, 
        {
          osq = {
            tail = {
              counter = 0
            }
          }, 
          __UNIQUE_ID_rh_kabi_hide3 = {
            spin_mlock = 0x0
          }, 
          {<No data fields>}
        }
      }, 
      wait = {
        lock = {
          {
            rlock = {
              raw_lock = {
                val = {
                  counter = 0
                }
              }
            }
          }
        }, 
        task_list = {
          next = 0xffffa13093eb3d38, 
          prev = 0xffffa0f072d87108
        }
      }, 
      nrbufs = 1, ----下面的buffers也是1,即这个pipe总共只有一个buf,而且已经被占用;说明pipe创建的时候,该用户的pipe page用量已经超过软限制,内核只给新pipe分配了1个page,这个主要受限于 pipe-user-pages-soft 的默认配置。
      curbuf = 0, 
      buffers = 1, 
      readers = 1, 
      writers = 1, 
      files = 2, 
      waiting_writers = 1, 
      r_counter = 1, 
      w_counter = 1, 
      tmp_page = 0x0, 
      fasync_readers = 0x0, 
      fasync_writers = 0x0, 
      bufs = 0xffffa132c3a3d100, 
      user = 0xffffffffb6e4d700
    }

    看下pipe中的内容:

    crash> pipe_buffer 0xffffa132c3a3d100
    struct pipe_buffer {
      page = 0xffffe392f992cb00, 
      offset = 0, 
      len = 4081, ---内容的长度
      ops = 0xffffffffb6a2e000, 
      flags = 0, 
      private = 0
    }
    
    crash> kmem -p |grep ffffe392f992cb00
    ffffe392f992cb00 2e64b2c000                0        0  1 2fffff00000000
    
    
    crash> rd  -a -p 2e64b2c000 4081-----------这个4081就是上面的长度
          2e64b2c000:  runtime/cgo: pthread_create failed: Resource temporarily una
          2e64b2c03c:  vailable
          2e64b2c045:  SIGABRT: abort-----------------公众号上有文章说这个是内存不足导致的,还贴了一段其他的oom日志来凑数;其实从上一行的 pthread_create failed: Resource temporarily unavailable(EAGAIN)可以看出,是线程数限制导致创建线程失败
          2e64b2c054:  PC=0x6c0fab m=0 sigcode=18446744073709551610
          2e64b2c082:  goroutine 0 [idle]:
          2e64b2c096:  runtime: unknown pc 0x6c0fab
          2e64b2c0b3:  stack: frame={sp:0x7ffc54fb5b18, fp:0x0} stack=[0x7ffc547b6f
          2e64b2c0ef:  a8,0x7ffc54fb5fd0)
          2e64b2c102:  00007ffc54fb5a18:  0000000000004000  0000000000000000 
          2e64b2c139:  00007ffc54fb5a28:  0000000000d0eb80  00007fe3c913f000 
          2e64b2c170:  00007ffc54fb5a38:  00007ffc54fb5a58  00007ffc54fb5a88 
          2e64b2c1a7:  00007ffc54fb5a48:  000000000040eb32 <runtime.persistentalloc
          2e64b2c1e3:  +130>  00007ffc54fb5a60 
          2e64b2c1fc:  00007ffc54fb5a58:  00007ffc54fb5aa0  00007ffc54fb5ab0 
          2e64b2c233:  00007ffc54fb5a68:  0000000000000040  0000000000000040 
          2e64b2c26a:  00007ffc54fb5a78:  0000000000000001  0000000000000002 
          2e64b2c2a1:  00007ffc54fb5a88:  00000000006b7ebc  000000000041a229 <runti
          2e64b2c2dd:  me.(*fixalloc).alloc+265> 
          2e64b2c2f8:  00007ffc54fb5a98:  000000000045bdde <runtime.callCgoMmap+62>
          2e64b2c334:    00007ffc54fb5aa0 
          2e64b2c348:  00007ffc54fb5aa8:  0000000000d0eb80  0000000054fb5af0 
          2e64b2c37f:  00007ffc54fb5ab8:  0000000000454d48 <runtime.mmap.func1+88> 
          2e64b2c3bb:   000000000041a19b <runtime.(*fixalloc).alloc+123> 
          2e64b2c3ee:  00007ffc54fb5ac8:  00007fe3c913f000  0000000000000030 
          2e64b2c425:  00007ffc54fb5ad8:  0000000000000030  0000000000000030 
          2e64b2c45c:  00007ffc54fb5ae8:  0000000000cd4c28  00000000008814b6 
          2e64b2c493:  00007ffc54fb5af8:  0000000001cd3db0  0000000000000011 
          2e64b2c4ca:  00007ffc54fb5b08:  000000000086d0d8  0000000000000000 
          2e64b2c501:  00007ffc54fb5b18: <00000000006d6198  0000000000000020 
          2e64b2c538:  00007ffc54fb5b28:  0000000000000000  0000000000000000 
          2e64b2c56f:  00007ffc54fb5b38:  0000000000000000  0000000000000000 
          2e64b2c5a6:  00007ffc54fb5b48:  0000000000000000  0000000000000000 
          2e64b2c5dd:  00007ffc54fb5b58:  0000000000000000  0000000000000000 
          2e64b2c614:  00007ffc54fb5b68:  0000000000000000  0000000000000000 
          2e64b2c64b:  00007ffc54fb5b78:  0000000000000000  0000000000000000 
          2e64b2c682:  00007ffc54fb5b88:  0000000000000000  0000000000000000 
          2e64b2c6b9:  00007ffc54fb5b98:  0000000000000000  000000000000000d 
          2e64b2c6f0:  00007ffc54fb5ba8:  00000000006c01bd  0000000000000000 
          2e64b2c727:  00007ffc54fb5bb8:  00000000006e97e3  00007fe3c913b558 
          2e64b2c75e:  00007ffc54fb5bc8:  0000000000cd4580  0000000000000001 
          2e64b2c795:  00007ffc54fb5bd8:  0000000000cd4603  0000000000a9d760 
          2e64b2c7cc:  00007ffc54fb5be8:  00000000006ea87b  0000000000cd4580 
          2e64b2c803:  00007ffc54fb5bf8:  000000000000000a  0000000001cd3db0 
          2e64b2c83a:  00007ffc54fb5c08:  0000000000000011  000000000086d0d8 
          2e64b2c871:  runtime: unknown pc 0x6c0fab
          2e64b2c88e:  stack: frame={sp:0x7ffc54fb5b18, fp:0x0} stack=[0x7ffc547b6f
          2e64b2c8ca:  a8,0x7ffc54fb5fd0)
          2e64b2c8dd:  00007ffc54fb5a18:  0000000000004000  0000000000000000 
          2e64b2c914:  00007ffc54fb5a28:  0000000000d0eb80  00007fe3c913f000 
          2e64b2c94b:  00007ffc54fb5a38:  00007ffc54fb5a58  00007ffc54fb5a88 
          2e64b2c982:  00007ffc54fb5a48:  000000000040eb32 <runtime.persistentalloc
          2e64b2c9be:  +130>  00007ffc54fb5a60 
          2e64b2c9d7:  00007ffc54fb5a58:  00007ffc54fb5aa0  00007ffc54fb5ab0 
          2e64b2ca0e:  00007ffc54fb5a68:  0000000000000040  0000000000000040 
          2e64b2ca45:  00007ffc54fb5a78:  0000000000000001  0000000000000002 
          2e64b2ca7c:  00007ffc54fb5a88:  00000000006b7ebc  000000000041a229 <runti
          2e64b2cab8:  me.(*fixalloc).alloc+265> 
          2e64b2cad3:  00007ffc54fb5a98:  000000000045bdde <runtime.callCgoMmap+62>
          2e64b2cb0f:    00007ffc54fb5aa0 
          2e64b2cb23:  00007ffc54fb5aa8:  0000000000d0eb80  0000000054fb5af0 
          2e64b2cb5a:  00007ffc54fb5ab8:  0000000000454d48 <runtime.mmap.func1+88> 
          2e64b2cb96:   000000000041a19b <runtime.(*fixalloc).alloc+123> 
          2e64b2cbc9:  00007ffc54fb5ac8:  00007fe3c913f000  0000000000000030 
          2e64b2cc00:  00007ffc54fb5ad8:  0000000000000030  0000000000000030 
          2e64b2cc37:  00007ffc54fb5ae8:  0000000000cd4c28  00000000008814b6 
          2e64b2cc6e:  00007ffc54fb5af8:  0000000001cd3db0  0000000000000011 
          2e64b2cca5:  00007ffc54fb5b08:  000000000086d0d8  0000000000000000 
          2e64b2ccdc:  00007ffc54fb5b18: <00000000006d6198  0000000000000020 
          2e64b2cd13:  00007ffc54fb5b28:  0000000000000000  0000000000000000 
          2e64b2cd4a:  00007ffc54fb5b38:  0000000000000000  0000000000000000 
          2e64b2cd81:  00007ffc54fb5b48:  0000000000000000  0000000000000000 
          2e64b2cdb8:  00007ffc54fb5b58:  0000000000000000  0000000000000000 
          2e64b2cdef:  00007ffc54fb5b68:  0000000000000000  0000000000000000 
          2e64b2ce26:  00007ffc54fb5b78:  0000000000000000  0000000000000000 
          2e64b2ce5d:  00007ffc54fb5b88:  0000000000000000  0000000000000000 
          2e64b2ce94:  00007ffc54fb5b98:  0000000000000000  000000000000000d 
          2e64b2cecb:  00007ffc54fb5ba8:  00000000006c01bd  0000000000000000 
          2e64b2cf02:  00007ffc54fb5bb8:  00000000006e97e3  00007fe3c913b558 
          2e64b2cf39:  00007ffc54fb5bc8:  0000000000cd4580  0000000000000001 
          2e64b2cf70:  00007ffc54fb5bd8:  0000000000cd4603  0000000000a9d760 
          2e64b2cfa7:  00007ffc54fb5be8:  00000000006ea87b  0000000000cd4580 
          2e64b2cfde:  00007ffc54fb5bf8:  

    然后看下对端为啥没有来读:

    crash> pipe_inode_info.wait 0xffffa1032ad42000
      wait = {
        lock = {
          {
            rlock = {
              raw_lock = {
                val = {
                  counter = 0
                }
              }
            }
          }
        }, 
        task_list = {
          next = 0xffffa13093eb3d38, --------__wait_queue的task_list链串在此;task_list在__wait_queue中的偏移是0x18,所以 0xffffa13093eb3d38 - 0x18 = 0xffffa13093eb3d20 就是等待项的地址
          prev = 0xffffa0f072d87108
        }
      }
    crash> __wait_queue 0xffffa13093eb3d20
    struct __wait_queue {
      flags = 0, 
      private = 0xffffa0e872d56180, ----对应的就是 423656
      func = 0xffffffffb62c3f50, 
      task_list = {
        next = 0xffffa0f072d87108, 
        prev = 0xffffa1032ad42030
      }
    }
     

    根据fd的对端信息,可以找到其父进程,也就是shim进程。shim进程是等runc退出之后再去读取pipe,而runc又因为pipe已经写满、write被阻塞而无法退出,所以形成了死锁。

    我们的解决方案是:

    1.增加 pipe-user-pages-soft 的配置。

    2.监控user_struct.pipe_bufs 的用量。

    3.建议不要去动shim 中等待runc退出再读取pipe的逻辑,除非大的故障,谁吃饱了没事去升级一遍containerd-shim。

    4.runc存活时间监控。

    ps:docker hung住的问题案例很多,比如删除容器的时候遇到容器内的进程D状态等等。

  • 相关阅读:
    连续子数组的最大和
    最小的K个数
    数组中出现次数超过一半的数字
    字符串的排列
    二叉搜索树与双向链表
    复杂链表的复制
    二叉树中和为某一值的路径
    二叉搜索树的后序遍历序列
    Xcode5下去除Icon高光
    Unity3D-基本导航(NavMesh)功能实现
  • 原文地址:https://www.cnblogs.com/10087622blog/p/13888215.html
Copyright © 2011-2022 走看看