Recently I hit a crash on a 3.10 kernel:
[ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 4109.683440] fas_readwriter D ffff881fffd147c0 0 46745 43298 0x00000080 [ 4109.683444] ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8 [ 4109.683469] ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20 [ 4109.683487] ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28 [ 4109.683491] Call Trace: [ 4109.683503] [<ffffffff8163f959>] schedule_preempt_disabled+0x29/0x70 [ 4109.683507] [<ffffffff8163d415>] __mutex_lock_slowpath+0xc5/0x1c0 [ 4109.683512] [<ffffffff811eaff7>] ? unlazy_walk+0x87/0x140 [ 4109.683515] [<ffffffff8163c87f>] mutex_lock+0x1f/0x2f [ 4109.683522] [<ffffffff81636235>] lookup_slow+0x33/0xa7 [ 4109.683525] [<ffffffff811f0483>] path_lookupat+0x773/0x7a0 [ 4109.683531] [<ffffffff81274454>] ? SYSC_semtimedop+0x264/0xd10 [ 4109.683537] [<ffffffff810c0cbb>] ? should_numa_migrate_memory+0x5b/0x150 [ 4109.683542] [<ffffffff811c3345>] ? kmem_cache_alloc+0x35/0x1d0 [ 4109.683545] [<ffffffff811f117f>] ? getname_flags+0x4f/0x1a0 [ 4109.683548] [<ffffffff811f04db>] filename_lookup+0x2b/0xc0 [ 4109.683550] [<ffffffff811f22a7>] user_path_at_empty+0x67/0xc0 [ 4109.683556] [<ffffffff811993c7>] ? handle_mm_fault+0x607/0xf40 [ 4109.683559] [<ffffffff811f2311>] user_path_at+0x11/0x20 [ 4109.683564] [<ffffffff811e5b53>] vfs_fstatat+0x63/0xc0 [ 4109.683567] [<ffffffff811e60be>] SYSC_newstat+0x2e/0x60 [ 4109.683570] [<ffffffff81273ced>] ? do_smart_update+0x11d/0x130 [ 4109.683575] [<ffffffff8101c899>] ? read_tsc+0x9/0x10 [ 4109.683580] [<ffffffff8110d336>] ? __audit_syscall_exit+0x1e6/0x280 [ 4109.683583] [<ffffffff811e639e>] SyS_newstat+0xe/0x10 [ 4109.683586] [<ffffffff81649909>] system_call_fastpath+0x16/0x1b
So the task has been blocked in an uninterruptible wait on a mutex for longer than the hung-task timeout.
The stack shows a stat() call fetching a file's metadata, with the lookup going through lookup_slow(). The nameidata on the task's stack tells us which name is being looked up:
crash> struct nameidata ffff881f320bfdc0
struct nameidata {
  path = { mnt = 0xffff883fcb7e2820, dentry = 0xffff881f502552c0 },
  last = {
    { { hash = 3495247374, len = 27 }, hash_len = 119459364366 },
    name = 0xffff881040c56031 "00fe_0002_011712b3_00000001"   ----------- the file name being looked up
  },
  root = { mnt = 0xffff883fccff1020, dentry = 0xffff881fcc7ff980 },
  inode = 0xffff881571308e78,
  flags = 1,
  seq = 2,
  last_type = 0,
  depth = 0,
  saved_names = {0x7f30b11923e8 <Address 0x7f30b11923e8 out of bounds>, 0xffff881f8a6a2c40 "g", 0xffff881f46cc0438 "", 0xffff881fce793e80 "260334341W 37210377377310354242o 37210377377 20E-D 37210377377200231 01201377377377377 20363 31201377377377377", 0x80 <Address 0x80 out of bounds>, 0xffff881f320bfec0 "260 04|a(177", 0xffffffff811993c7 <handle_mm_fault+1543> "A211302351327375377377220I211370L211341H211332L211356L211367D211M300350310] 03", 0x5cd06a0 <Address 0x5cd06a0 out of bounds>, 0x0}
}
files does not show this file, which means the process never open()ed it.
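The mutex in the hung-task trace comes from lookup_slow(): when a name cannot be resolved from the dentry cache, the VFS serializes the lookup on the parent directory's i_mutex. Roughly, from 3.10-era fs/namei.c (trimmed to the relevant lines):

static int lookup_slow(struct nameidata *nd, struct path *path)
{
        struct dentry *parent, *dentry;

        parent = nd->path.dentry;                       /* the parent directory */
        BUG_ON(nd->inode != parent->d_inode);

        mutex_lock(&parent->d_inode->i_mutex);          /* <-- fas_readwriter blocks here */
        dentry = __lookup_hash(&nd->last, parent, nd->flags);
        mutex_unlock(&parent->d_inode->i_mutex);
        ...
}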
Now let's find that mutex:
crash> struct nameidata.path ffff881f320bfdc0
  path = { mnt = 0xffff883fcb7e2820, dentry = 0xffff881f502552c0 }
crash> dentry.d_inode 0xffff881f502552c0
  d_inode = 0xffff881571308e78
crash> struct -x dentry.d_inode
struct dentry {
   [0x30] struct inode *d_inode;
}
crash> inode.i_mutex 0xffff881571308e78
  i_mutex = {
    count = { counter = -1 },   ------- -1: locked, with waiters queued on wait_list
    wait_lock = { { rlock = { raw_lock = { { head_tail = 100009462, tickets = { head = 1526, tail = 1526 } } } } } },
    wait_list = { next = 0xffff8819fc5b3c90, prev = 0xffff881f5097bc40 },
    owner = 0xffff881f52764500,   ------- a spinlock does not record its owner unless debugging is enabled, but for a mutex the owner field normally tells you who is holding it
    { osq = 0x0, __UNIQUE_ID_rh_kabi_hide1 = { spin_mlock = 0x0 }, {<No data fields>} }
  }
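That owner field is what makes this easy to chase: on a build with mutex owner tracking compiled in (an assumption about this SMP kernel), mutex_lock() records the holder. Roughly, from 3.10-era kernel/mutex.c:

void __sched mutex_lock(struct mutex *lock)
{
        might_sleep();
        /* fastpath: atomically take count from 1 to 0, else fall into the slowpath */
        __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
        mutex_set_owner(lock);          /* lock->owner = current */
}

So the holder is the task_struct at 0xffff881f52764500.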
Next, let's see why the owner grabbed the mutex and never let go:
task 0xffff881f52764500
PID: 47270   TASK: ffff881f52764500   CPU: 25   COMMAND: "nginx"
struct task_struct {
  state = 2,   ------- TASK_UNINTERRUPTIBLE
  stack = 0xffff881a5c19c000,
Look at its stack:
crash> bt -f 47270
PID: 47270  TASK: ffff881f52764500  CPU: 25  COMMAND: "nginx"
 #0 [ffff881a5c19f7f0] __schedule at ffffffff8163df9b
    ffff881a5c19f7f8: 0000000000000082 ffff881a5c19ffd8
    ffff881a5c19f808: ffff881a5c19ffd8 ffff881a5c19ffd8
    ffff881a5c19f818: 00000000000147c0 ffff881fd3267300
    ffff881a5c19f828: ffff8813ee889a58 ffff8813ee889a60
    ffff881a5c19f838: 7fffffffffffffff ffff881f52764500
    ffff881a5c19f848: ffff881fd042b500 ffff881a5c19f860
    ffff881a5c19f858: ffffffff8163e879
 #1 [ffff881a5c19f858] schedule at ffffffff8163e879
    ffff881a5c19f860: ffff881a5c19f908 ffffffff8163c329
 #2 [ffff881a5c19f868] schedule_timeout at ffffffff8163c329
    ffff881a5c19f870: 7fffffffffffffff ffff881a5c19f900
    ffff881a5c19f880: 0000000000000000 0000000000000000
    ffff881a5c19f890: 0000000000000000 ffff8813ee889980
    ffff881a5c19f8a0: ffff881a5c19f8b8 ffffffff812cea44
    ffff881a5c19f8b0: 0000000000000001 ffff881a5c19f968
    ffff881a5c19f8c0: ffffffffa07a51c4 000010005c19f8d8
    ffff881a5c19f8d0: 0000000000000000 00000000edb89915
    ffff881a5c19f8e0: ffff8813ee889a58 ffff8813ee889a60
    ffff881a5c19f8f0: 7fffffffffffffff ffff881f52764500
    ffff881a5c19f900: ffff881fd042b500 ffff881a5c19f968
    ffff881a5c19f910: ffffffff8163ec46
 #3 [ffff881a5c19f910] wait_for_completion at ffffffff8163ec46
    ffff881a5c19f918: 0000000000000001 ffff881f52764500
    ffff881a5c19f928: ffffffff810b9930 ffff8813ee889a68
    ffff881a5c19f938: ffff8813ee889a68 00000000edb89915
    ffff881a5c19f948: ffff8813ee889980 0000000000000001
    ffff881a5c19f958: ffffffffa07a6ce3 0000000000000000
    ffff881a5c19f968: ffff881a5c19f990 ffffffffa07a6b74
 #4 [ffff881a5c19f970] xfs_buf_submit_wait at ffffffffa07a6b74 [xfs]   --------- the synchronous buffer I/O submit function
    ffff881a5c19f978: ffff8813ee889980 0000000000000001   ----------- ffff8813ee889980 is the xfs_buf
    ffff881a5c19f988: ffffffffa07d3829 ffff881a5c19f9a0
    ffff881a5c19f998: ffffffffa07a6ce3
 #5 [ffff881a5c19f998] _xfs_buf_read at ffffffffa07a6ce3 [xfs]
    ffff881a5c19f9a0: ffff881a5c19f9e0 ffffffffa07a6dda
 #6 [ffff881a5c19f9a8] xfs_buf_read_map at ffffffffa07a6dda [xfs]   ---------- reaching this branch means tp is NULL; look up the xfs_buf by map, allocate one if not found; sector 1960485944, length 8
    ffff881a5c19f9b0: ffffffffa07e6cc0 0000000000000000
    ffff881a5c19f9c0: 0000000000000000 ffff883f001f7800
    ffff881a5c19f9d0: ffff881a5c19fa60 ffff881fd042b500
    ffff881a5c19f9e0: ffff881a5c19fa20 ffffffffa07d3829
 #7 [ffff881a5c19f9e8] xfs_trans_read_buf_map at ffffffffa07d3829 [xfs]   ---- calls xfs_buf_read_map with the trans and the xfs_buf_map to get the xfs_buf
    ffff881a5c19f9f0: 00000000edb89915 ffff881a5c19fb10
    ffff881a5c19fa00: 0000000000000000 ffff881571308cc0
    ffff881a5c19fa10: ffffffffa07e6cc0 ffff881a5c19fa60
    ffff881a5c19fa20: ffff881a5c19faa0 ffffffffa0786204
 #8 [ffff881a5c19fa28] xfs_da_read_buf at ffffffffa0786204 [xfs]
    ffff881a5c19fa30: ffff881a5c19fa50 ffffffffa07e6cc0
    ffff881a5c19fa40: 0000000000000000 00000001a07e6650
    ffff881a5c19fa50: 0000000000000000 ffff881a5c19fa60
    ffff881a5c19fa60: 0000000074daa438 ffff881a00000008
    ffff881a5c19fa70: 00000000edb89915 0000000000000000
    ffff881a5c19fa80: ffff881a5c19fb10 ffff8810cfbd1b00
    ffff881a5c19fa90: 0000000000000016 000000000000006c
    ffff881a5c19faa0: ffff881a5c19fad0 ffffffffa078baa6
 #9 [ffff881a5c19faa8] xfs_dir3_data_read at ffffffffa078baa6 [xfs]
    ffff881a5c19fab0: ffffffffa07e6cc0 ffff881f5a13ad00
    ffff881a5c19fac0: ffff881f5a13ad00 ffff8814905453a0
    ffff881a5c19fad0: ffff881a5c19fb58 ffffffffa078f99e
#10 [ffff881a5c19fad8] xfs_dir2_leafn_lookup_for_entry at ffffffffa078f99e [xfs]
    ffff881a5c19fae0: 0000000172bc1800 ffff881f5a13ad58
    ffff881a5c19faf0: ffff883f001f7800 ffff881571308cc0
    ffff881a5c19fb00: 0000000000000000 ffff881400000016
    ffff881a5c19fb10: 0000000000000000 0080001d00800010
    ffff881a5c19fb20: ffff000501873dff 00000000edb89915
    ffff881a5c19fb30: ffff881f5a13ad40 ffff881f5a13ad00
    ffff881a5c19fb40: ffff881f5a13ad10 ffff8810cfbd1b00
    ffff881a5c19fb50: ffff881f5a13ad58 ffff881a5c19fb68
    ffff881a5c19fb60: ffffffffa0791177
#11 [ffff881a5c19fb60] xfs_dir2_leafn_lookup_int at ffffffffa0791177 [xfs]
    ffff881a5c19fb68: ffff881a5c19fbc8 ffffffffa0787726
#12 [ffff881a5c19fb70] xfs_da3_node_lookup_int at ffffffffa0787726 [xfs]
    ffff881a5c19fb78: ffff881a5c19fbdc ffff881a5c19fcb8
    ffff881a5c19fb88: 0000000000000000 ffff000100643ebe
    ffff881a5c19fb98: 00000000edb89915 ffff8810cfbd1b00
    ffff881a5c19fba8: 0000000000000000 ffff8810cfbd1b00
    ffff881a5c19fbb8: ffff881f5a13ad00 ffff881a5c19fcb8
    ffff881a5c19fbc8: ffff881a5c19fc08 ffffffffa0791ded
#13 [ffff881a5c19fbd0] xfs_dir2_node_lookup at ffffffffa0791ded [xfs]
    ffff881a5c19fbd8: ffff881a5c19fc08 00000000edb89915
    ffff881a5c19fbe8: ffff8810cfbd1b00 0000000000000000
    ffff881a5c19fbf8: 0000000000000000 0000000000000000
    ffff881a5c19fc08: ffff881a5c19fc58 ffffffffa07897b5
#14 [ffff881a5c19fc10] xfs_dir_lookup at ffffffffa07897b5 [xfs]
    ffff881a5c19fc18: ffff881a5c19fc68 0000000000000008
    ffff881a5c19fc28: 00000000edb89915 0000000000000008
    ffff881a5c19fc38: ffff881571308cc0 ffff881a5c19fcb8
    ffff881a5c19fc48: 0000000000000000 ffff881a5c19fcb0
    ffff881a5c19fc58: ffff881a5c19fca0 ffffffffa07b7bb6
#15 [ffff881a5c19fc60] xfs_lookup at ffffffffa07b7bb6 [xfs]
    ffff881a5c19fc68: ffff881a5c19fc90 00000000edb89915
    ffff881a5c19fc78: ffff881fcd1038c0 0000000000008000
    ffff881a5c19fc88: ffff881a5c19fdf0 ffff881f502552c0
    ffff881a5c19fc98: ffff881a5c19fe50 ffff881a5c19fcd8
    ffff881a5c19fca8: ffffffffa07b4a0b
#16 [ffff881a5c19fca8] xfs_vn_lookup at ffffffffa07b4a0b [xfs]
    ffff881a5c19fcb0: ffff881a5c19fe60 ffff881fcd1038f8
    ffff881a5c19fcc0: 000000000000001b 00000000edb89915
    ffff881a5c19fcd0: ffff881fcd1038c0 ffff881a5c19fcf8
    ffff881a5c19fce0: ffffffff811eac3d
#17 [ffff881a5c19fce0] lookup_real at ffffffff811eac3d
    ffff881a5c19fce8: ffff881f502552c0 ffff881a5c19ff28
    ffff881a5c19fcf8: ffff881a5c19fda8 ffffffff811ee813
#18 [ffff881a5c19fd00] do_last at ffffffff811ee813   --------------- do_last --> lookup_open --> lookup_real, corresponding to line 2905
    ffff881a5c19fd08: ffff881a5c19ff58 ffffffff81274abd
    ffff881a5c19fd18: 0000000000000000 ffff881f52764500
    ffff881a5c19fd28: 0000001f52764500 0000000000000000
    ffff881a5c19fd38: ffff881571308e78 ffff88041076d000   ------------ ffff881571308e78: the inode we found above
    ffff881a5c19fd48: ffff881a5c19fde4 ffff881f502552c0   ------------- the parent directory's dentry
    ffff881a5c19fd58: ffff880aed4d5700 ffff880000000024
    ffff881a5c19fd68: 01ff880aed4d5700 ffff881a5c19fd80
    ffff881a5c19fd78: 00000000edb89915 ffff881a5c19fe50
    ffff881a5c19fd88: ffff880aed4d5700 ffff88041076d000
    ffff881a5c19fd98: ffff881a5c19ff28 ffff881f52764500
    ffff881a5c19fda8: ffff881a5c19fe40 ffffffff811f0be2
#19 [ffff881a5c19fdb0] path_openat at ffffffff811f0be2
    ffff881a5c19fdb8: ffffea00515463b0 0000000000000000
    ffff881a5c19fdc8: 0000000000000200 ffff883fc04c7080
    ffff881a5c19fdd8: 0000004100000000 00000000000000a9
    ffff881a5c19fde8: 0000000000000000 00007f2c5676d000
    ffff881a5c19fdf8: ffffea00d1d8c2c0 00000000edb89915
    ffff881a5c19fe08: 00007f2c5676dff0 00000000edb89915
    ffff881a5c19fe18: 00000000ffffff9c ffff88041076d000
    ffff881a5c19fe28: ffff881a5c19ff28 0000000000000001
    ffff881a5c19fe38: 00000000000036cc ffff881a5c19ff10
    ffff881a5c19fe48: ffffffff811f23ab
#20 [ffff881a5c19fe48] do_filp_open at ffffffff811f23ab
    ffff881a5c19fe50: ffff883fcb7e2820 ffff881f502552c0
    ffff881a5c19fe60: 0000001bc5e44c0c ffff88041076d031
    ffff881a5c19fe70: ffff883fccff1020 ffff881fcc7ff980
    ffff881a5c19fe80: ffff881571308e78 0000000200000101
    ffff881a5c19fe90: 0000000000000000 0000000000000001
    ffff881a5c19fea0: 00007f2c64217510 0000000000000000
    ffff881a5c19feb0: 0000000000000000 ffff881a5c19ff00
    ffff881a5c19fec0: ffffffff811fef47 ffff883fc175c3c0
    ffff881a5c19fed0: 0010000000008000 0000000000000001
    ffff881a5c19fee0: 0000000000008000 00000000edb89915
    ffff881a5c19fef0: 0000000000000001 00000000000000e3
    ffff881a5c19ff00: 00000000ffffff9c ffff88041076d000
    ffff881a5c19ff10: ffff881a5c19ff68 ffffffff811dfd53
#21 [ffff881a5c19ff18] do_sys_open at ffffffff811dfd53
    ffff881a5c19ff20: ffff881f49292a48 ffff000000008000
    ffff881a5c19ff30: 0000010000000024 00000000edb89915
    ffff881a5c19ff40: 0000000000000000 00007f2c64217664
    ffff881a5c19ff50: 0000000000000002 00007f2c64217653
    ffff881a5c19ff60: 00000000000036cc ffff881a5c19ff78
    ffff881a5c19ff70: ffffffff811dfe6e
#22 [ffff881a5c19ff70] sys_open at ffffffff811dfe6e
    ffff881a5c19ff78: 00007f2c64217510 ffffffff81649909
#23 [ffff881a5c19ff80] system_call_fastpath at ffffffff81649909
    RIP: 00007f2e31a3dc3d  RSP: 00007f2c64217358  RFLAGS: 00010202
    RAX: 0000000000000002  RBX: ffffffff81649909  RCX: 0000000000010000
    RDX: 0000000000000005  RSI: 0000000000000000  RDI: 00007f2c64217510
    RBP: 00007f2c64217510   R8: 000000000000ffff   R9: 000000000000001f
    R10: 00007f2e30df7000  R11: 0000000000000293  R12: ffffffff811dfe6e
    R13: ffff881a5c19ff78  R14: 00000000000036cc  R15: 00007f2c64217653
    ORIG_RAX: 0000000000000002  CS: 0033  SS: 002b
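So nginx is holding the directory's i_mutex from inside do_last() --> lookup_open() --> lookup_real(), i.e. it is doing a lookup in that same directory, and lookup_real() is just a thin wrapper that calls into the filesystem. Roughly, from 3.10-era fs/namei.c:

static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
                                  unsigned int flags)
{
        struct dentry *old;

        /* Don't create child dentry for a dead directory. */
        if (unlikely(IS_DEADDIR(dir))) {
                dput(dentry);
                return ERR_PTR(-ENOENT);
        }

        old = dir->i_op->lookup(dir, dentry, flags);    /* xfs_vn_lookup in this stack */
        if (unlikely(old)) {
                dput(dentry);
                dentry = old;
        }
        return dentry;
}

Everything from xfs_vn_lookup() down to frame #4 is XFS walking the directory blocks to find the entry.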
This process is waiting on a completion:
The completion being waited on:

crash> completion ffff8813ee889a58
struct completion {
  done = 0,   ---------------- still not completed; if it had completed this would be 1, set by complete(), which also wakes the waiting task
  wait = {
    lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = { head = 2, tail = 2 } } } } } },
    task_list = { next = 0xffff881a5c19f930, prev = 0xffff881a5c19f930 }
  }
}
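For reference, the thing that never happened: complete() is what bumps done and wakes the waiter. The generic code is roughly:

void complete(struct completion *x)
{
        unsigned long flags;

        spin_lock_irqsave(&x->wait.lock, flags);
        x->done++;                                      /* done would become 1 */
        __wake_up_locked(&x->wait, TASK_NORMAL, 1);     /* wake the task queued on task_list */
        spin_unlock_irqrestore(&x->wait.lock, flags);
}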
This completion is really just waiting for an I/O to finish.
The call chain is:

xfs_buf_read_map --> _xfs_buf_read --> xfs_buf_submit_wait --> _xfs_buf_ioapply --> xfs_buf_ioapply_map --> submit_bio

After _xfs_buf_ioapply() returns, xfs_buf_submit_wait() calls wait_for_completion() and has just been sitting there ever since.
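A paraphrased sketch of that synchronous submit-and-wait path (not the verbatim source of this kernel; details differ across versions):

int xfs_buf_submit_wait(struct xfs_buf *bp)
{
        bp->b_io_error = 0;
        bp->b_flags &= ~XBF_ASYNC;              /* synchronous: the caller will wait */

        atomic_set(&bp->b_io_remaining, 1);     /* submission reference */
        _xfs_buf_ioapply(bp);                   /* xfs_buf_ioapply_map() -> submit_bio() */

        /* drop the submission reference; if the bios already finished, run ioend here */
        if (atomic_dec_and_test(&bp->b_io_remaining))
                xfs_buf_ioend(bp);

        wait_for_completion(&bp->b_iowait);     /* <-- nginx is parked here */
        return bp->b_error;
}

Whoever calls complete(&bp->b_iowait) is therefore the only way out of this wait.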
Look at the corresponding xfs_buf:
crash> xfs_buf 0xffff8813ee889980
struct xfs_buf {
  b_rbnode = { __rb_parent_color = 18446612139147634816, rb_right = 0x0, rb_left = 0x0 },
  b_bn = 1960485944,   ---------- sector number
  b_length = 8,
  b_hold = { counter = 2 },
  b_lru_ref = { counter = 1 },
  b_flags = 1048577,   ---------- i.e. 0x100001: the I/O has not completed and the async flag is not set (#define XBF_ASYNC (1 << 4), #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */), so this is a synchronous I/O that has to be waited for
  b_sema = {
    lock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } },
    count = 0,
    wait_list = { next = 0xffff8813ee8899b8, prev = 0xffff8813ee8899b8 }
  },
  b_lru = { next = 0xffff8813ee8899c8, prev = 0xffff8813ee8899c8 },
  b_lru_flags = 0,
  b_lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } },
  b_io_error = 0,
  b_waiters = {
    lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } },
    task_list = { next = 0xffff8813ee8899f0, prev = 0xffff8813ee8899f0 }
  },
  b_list = { next = 0xffff8813ee889a00, prev = 0xffff8813ee889a00 },
  b_pag = 0xffff881f4b98db00,   ------- the AG this xfs_buf belongs to
  b_target = 0xffff881fd042b500,
  b_addr = 0xffff880750046000,
  b_ioend_work = {   ---------------------- the work_struct used once the I/O finishes
    data = { counter = 0 },
    entry = { next = 0x0, prev = 0x0 },
    func = 0x0
  },
  b_ioend_wq = 0xffff881f5ab7c400,   ----------- at ioend time, b_ioend_work is initialized and queued on this workqueue
  b_iodone = 0x0,   ----------------------- NULL
  b_iowait = {   --------------- this is the completion we found above
    done = 0,
    wait = {
      lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = {
        head = 2,   ------------ this lock once cost me dearly; I remember it well
        tail = 2 } } } } } },
      task_list = { next = 0xffff881a5c19f930, prev = 0xffff881a5c19f930 }
    }
  },
  b_fspriv = 0x0,
  b_transp = 0x0,   ---------------- the associated xfs_trans; the tp passed in was NULL
  b_pages = 0xffff8813ee889a90,
  b_page_array = {0xffffea001d401180, 0x0},
  b_maps = 0xffff8813ee889aa8,
  __b_map = {
    bm_bn = 1960485944,   ---------- sector; each AG manages a fixed number of sectors, and this one falls in AG 1
    bm_len = 8   ------ length
  },
  b_map_count = 1,
  b_io_length = 8,
  b_pin_count = { counter = 0 },
  b_io_remaining = { counter = 1 },   ------------ still 1
  b_page_count = 1,
  b_offset = 0,
  b_error = 0,
  b_ops = 0xffffffffa07e6cc0 <xfs_dir3_data_buf_ops>   --- xfs_bufs carrying different kinds of objects get different ops pointers
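To decode b_flags = 0x100001, these are the relevant flag values as I remember them from xfs_buf.h for this generation of XFS (worth double-checking against the exact source tree):

#define XBF_READ        (1 << 0)        /* buffer intended for reading from device */
#define XBF_WRITE       (1 << 1)        /* buffer intended for writing to device */
#define XBF_ASYNC       (1 << 4)        /* initiator will not wait for completion */
#define XBF_DONE        (1 << 5)        /* all pages in the buffer uptodate */
#define _XBF_PAGES      (1 << 20)       /* backed by refcounted pages */

/* 0x100001 == XBF_READ | _XBF_PAGES: a synchronous read (XBF_ASYNC clear)
 * whose contents were never marked uptodate (XBF_DONE clear). */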
When the I/O completes normally, the xfs_buf_ioend callback handles the result for this xfs_buf:
	if (bp->b_iodone)			/* our buf does not take this branch: b_iodone is NULL */
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)	/* our buf does not take this one either: XBF_ASYNC is not set */
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);	/* the branch our buf should take */
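For completeness, the plumbing that should have led to that complete() call: when the last bio for the buffer finishes, XFS drops b_io_remaining and queues the buffer onto its b_ioend_wq, whose worker ends up in xfs_buf_ioend(). A rough sketch based on the fields we dumped above (the backported code in this kernel may differ in detail):

static void xfs_buf_ioend_work(struct work_struct *work)
{
        struct xfs_buf *bp = container_of(work, struct xfs_buf, b_ioend_work);

        xfs_buf_ioend(bp);      /* ends in complete(&bp->b_iowait) for our buffer */
}

static void xfs_buf_bio_end_io(struct bio *bio, int error)
{
        struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;

        if (error)
                cmpxchg(&bp->b_io_error, 0, error);     /* record the first error */

        /* last outstanding bio for this buffer: hand off to the ioend workqueue */
        if (atomic_dec_and_test(&bp->b_io_remaining)) {
                INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
                queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
        }
        bio_put(bio);
}

In our dump b_io_remaining is still 1 and b_ioend_work was never initialized (func = 0x0), so this end_io path has not run for the buffer's bio.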
xfs_buf objects are cached and managed per allocation group (AG), and XFS keeps the AGs in a radix tree. Our sector number divided by 8 gives the filesystem block number, and since each AG manages a fixed number of blocks, the AG index is easy to work out.
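Back-of-the-envelope, mirroring what xfs_daddr_to_agno() does (sb_agblocks comes from the superblock; dividing the sector number by 8 assumes 4 KiB filesystem blocks):

uint64_t daddr = 1960485944;                            /* bp->b_bn, in 512-byte sectors */
uint64_t fsbno = daddr >> 3;                            /* 245060743: filesystem block number */
uint32_t agno  = fsbno / mp->m_sb.sb_agblocks;          /* AG index -- 1 for this filesystem */

struct xfs_perag *pag = radix_tree_lookup(&mp->m_perag_tree, agno);     /* what xfs_perag_get() does */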
crash> xfs_perag 0xffff881f4b98db00
struct xfs_perag {
  pag_mount = 0xffff883f001f7800,
  pag_agno = 1,   ------------------------- matches what we worked out above
  pag_ref = { counter = 1002 },
  pagf_init = 1 '