参考
https://www.ibm.com/developerworks/cn/linux/l-cn-deadlock/index.html
https://blog.csdn.net/peng314899581/article/details/79064616
https://www.cnblogs.com/youxin/p/8837771.html
https://www.jianshu.com/p/d451793cab4c?utm_source=oschina-app
http://blog.sina.com.cn/s/blog_a2a6dd380102xtec.html
https://blog.csdn.net/wanxuexiang/article/details/88382808
https://ethanhao.github.io/c++11,/gdb,/multithread,/2017/03/03/Deadlock-detecting-using-GDB-Copy.html
前言
Windows下死锁的解决方法已经很熟悉了。首先,Windows via C/C++中,提供了一个工程-LockCop,附加到一个进程,判断是否有死锁。死锁的现象行为有显著的特点,程序表面上看上去一切正常,但是某些信息或是消息发送过去后,无法处理。一般我们用LockCop判断是否有死锁,发现有死锁之后,用Visual Studio远程附加到进程调试,看看对应线程卡在哪个位置。一般都会卡在加锁的位置,然后看看两个死锁的线程代码上几步,是不是相互锁定了对方现在正在请求的锁。这样就可以很快的查到死锁的问题。
Linux下调查死锁的方法与Windows类似,也是先确认是否死锁,然后找到哪两个线程死锁,然后调试具体看线程卡在哪一步。
代码
这段代码创建了4个线程:其中两个线程会互相死锁,另外两个线程每隔一秒循环操作一个字符数组。
#include <unistd.h>
#include <pthread.h>
#include <string.h>
/* mutex1 and mutex2 are the pair that func1() and func2() lock in opposite order. */
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex2 = PTHREAD_MUTEX_INITIALIZER;
/* mutex3 and mutex4 are never locked in this listing; main() only destroys them. */
pthread_mutex_t mutex3 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex4 = PTHREAD_MUTEX_INITIALIZER;
/* Counters: sequence1 is incremented while mutex1 is held, sequence2 while mutex2 is held. */
static int sequence1 = 0;
static int sequence2 = 0;
/*
 * Deadlock demo, first half: locks mutex1, sleeps one second, then locks mutex2.
 *
 * func2() takes the same two mutexes in the opposite order, so running both
 * concurrently is expected to deadlock. That lock-order inversion is the
 * purpose of this demo and is deliberately left in place.
 *
 * Returns the value of sequence1 as read while mutex1 was still held.
 */
int func1()
{
    int snapshot;

    pthread_mutex_lock(&mutex1);
    ++sequence1;
    sleep(1); /* widen the window so the other thread can take its first mutex */
    pthread_mutex_lock(&mutex2);
    ++sequence2;
    pthread_mutex_unlock(&mutex2);
    /* Read before unlocking: func2() writes sequence1 under mutex1, so a
       read after the unlock would be an unsynchronized access. */
    snapshot = sequence1;
    pthread_mutex_unlock(&mutex1);
    return snapshot;
}
/*
 * Deadlock demo, second half: locks mutex2, sleeps one second, then locks mutex1.
 *
 * func1() takes the same two mutexes in the opposite order, so running both
 * concurrently is expected to deadlock. That lock-order inversion is the
 * purpose of this demo and is deliberately left in place.
 *
 * Returns the value of sequence2 as read while mutex2 was still held.
 */
int func2()
{
    int snapshot;

    pthread_mutex_lock(&mutex2);
    ++sequence2;
    sleep(1); /* widen the window so the other thread can take its first mutex */
    pthread_mutex_lock(&mutex1);
    ++sequence1;
    pthread_mutex_unlock(&mutex1);
    /* Read before unlocking: func1() writes sequence2 under mutex2, so a
       read after the unlock would be an unsynchronized access. */
    snapshot = sequence2;
    pthread_mutex_unlock(&mutex2);
    return snapshot;
}
/*
 * Thread start routine that drives func1().
 * Calls func1() over and over and terminates the calling thread with
 * pthread_exit(NULL) as soon as one call returns 100000.
 * arg is not used.
 */
void* thread1(void* arg)
{
    for (;;)
    {
        if (func1() != 100000)
        {
            continue;
        }
        pthread_exit(NULL);
    }
}
/*
 * Thread start routine that drives func2().
 * Calls func2() over and over and terminates the calling thread with
 * pthread_exit(NULL) as soon as one call returns 100000.
 * arg is not used.
 */
void* thread2(void* arg)
{
    for (;;)
    {
        if (func2() != 100000)
        {
            continue;
        }
        pthread_exit(NULL);
    }
}
/*
 * Thread start routine for a worker that takes no locks.
 * Forever: sleeps one second, then zeroes a 128-byte stack buffer and copies
 * the label "thread3" into it. Never returns; arg is not used.
 */
void* thread3(void* arg)
{
    for (;;)
    {
        char label[128];

        sleep(1);
        memset(label, 0, sizeof label);
        strcpy(label, "thread3");
    }
}
/*
 * Thread start routine for a worker that takes no locks.
 * Forever: sleeps one second, then zeroes a 128-byte stack buffer and copies
 * this thread's label into it. Never returns; arg is not used.
 */
void* thread4(void* arg)
{
    while (1)
    {
        sleep(1);
        char szBuf[128];
        memset(szBuf, 0, sizeof(szBuf));
        /* Was "thread3" (copy-paste from thread3); use this thread's own label
           so the buffer identifies the right thread when inspected in a debugger. */
        strcpy(szBuf, "thread4");
    }
}
/*
 * Starts thread1..thread4 in that order, sleeps five seconds, joins the
 * threads in creation order, then destroys mutex1..mutex4.
 * If any pthread_create() call fails the process ends at once via _exit(1).
 */
int main()
{
    enum { WORKER_COUNT = 4 };
    void* (*const entry[WORKER_COUNT])(void*) = { thread1, thread2, thread3, thread4 };
    pthread_mutex_t* const locks[] = { &mutex1, &mutex2, &mutex3, &mutex4 };
    const int lock_count = (int)(sizeof locks / sizeof locks[0]);
    pthread_t tid[WORKER_COUNT];
    int i;

    for (i = 0; i < WORKER_COUNT; ++i)
    {
        if (pthread_create(&tid[i], NULL, entry[i], NULL) != 0)
        {
            _exit(1);
        }
    }

    sleep(5);
    /* pthread_cancel(tid[0]); -- left disabled, exactly as in the original listing */

    for (i = 0; i < WORKER_COUNT; ++i)
    {
        pthread_join(tid[i], NULL);
    }

    for (i = 0; i < lock_count; ++i)
    {
        pthread_mutex_destroy(locks[i]);
    }
    return 0;
}
编译运行
第一种方式 strace
找到我们的进程
$ ps aux -T |grep a.out
root 6794 6794 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6795 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6796 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6797 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6798 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6800 6800 0.0 0.0 3216 892 pts/1 R+ 14:23 0:00 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn --exclude-dir=.idea --exclude-dir=.tox a.out
我们看到6794这个进程,也就是我们运行的程序,共有5个线程:程序启动时自带的主线程,加上随后创建的4个子线程。
用strace查看每个线程的状态
# root @ debian in ~ [14:27:42] C:130
$ strace -p 6794
strace: Process 6794 attached
futex(0x7f1d36d1a9d0, FUTEX_WAIT, 6795, NULL^Cstrace: Process 6794 detached
<detached ...>
# root @ debian in ~ [14:27:46] C:130
$ strace -p 6795
strace: Process 6795 attached
futex(0x5608207030e0, FUTEX_WAIT_PRIVATE, 2, NULL^Cstrace: Process 6795 detached
<detached ...>
# root @ debian in ~ [14:27:51] C:130
$ strace -p 6796
strace: Process 6796 attached
futex(0x5608207030a0, FUTEX_WAIT_PRIVATE, 2, NULL^Cstrace: Process 6796 detached
<detached ...>
# root @ debian in ~ [14:27:55] C:130
$ strace -p 6797
strace: Process 6797 attached
restart_syscall(<... resuming interrupted nanosleep ...>) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, ^Cstrace: Process 6797 detached
<detached ...>
# root @ debian in ~ [14:28:02] C:130
$ strace -p 6798
strace: Process 6798 attached
restart_syscall(<... resuming interrupted nanosleep ...>) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, ^Cstrace: Process 6798 detached
<detached ...>
主线程必然处于阻塞或循环状态,否则程序早已执行完并退出,所以6794处于等待状态是正常的。多次执行strace可以看到,6795和6796也始终停在同一个futex等待上。对于正常运行的程序,抓取信息时很难恰好碰到线程处于加锁等待状态,更不用说多次抓取都停在同一个等待上,因此基本可以断定这两个线程发生了死锁。6797和6798则符合代码的执行流程:先sleep,然后做一些操作,strace可以记录到每一次nanosleep系统调用。有关futex的更多信息,请参考futex(2)手册页(man 2 futex)。
第二种方式 gdb调试
$ gdb
GNU gdb (Debian 8.2.1-2+b3) 8.2.1
Copyright (C) 2018 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
(gdb) attach 7291
Attaching to process 7291
[New LWP 7292]
[New LWP 7293]
[New LWP 7294]
[New LWP 7295]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
0x00007fea5c5c6495 in __GI___pthread_timedjoin_ex (threadid=140644543452928, thread_return=0x0, abstime=0x0, block=<optimized out>) at pthread_join_common.c:89
89 pthread_join_common.c: No such file or directory.
(gdb) info threads
Id Target Id Frame
* 1 Thread 0x7fea5c0d6740 (LWP 7291) "a.out" 0x00007fea5c5c6495 in __GI___pthread_timedjoin_ex (threadid=140644543452928, thread_return=0x0, abstime=0x0,
block=<optimized out>) at pthread_join_common.c:89
2 Thread 0x7fea5c0d5700 (LWP 7292) "a.out" __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
3 Thread 0x7fea5b8d4700 (LWP 7293) "a.out" __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
4 Thread 0x7fea5b0d3700 (LWP 7294) "a.out" 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5b0d2e20,
remaining=remaining@entry=0x7fea5b0d2e20) at ../sysdeps/unix/sysv/linux/nanosleep.c:28
5 Thread 0x7fea5a8d2700 (LWP 7295) "a.out" 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5a8d1e20,
remaining=remaining@entry=0x7fea5a8d1e20) at ../sysdeps/unix/sysv/linux/nanosleep.c:28
(gdb) thread apply all bt
Thread 5 (Thread 0x7fea5a8d2700 (LWP 7295)):
#0 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5a8d1e20, remaining=remaining@entry=0x7fea5a8d1e20)
at ../sysdeps/unix/sysv/linux/nanosleep.c:28
#1 0x00007fea5c1a162a in __sleep (seconds=0) at ../sysdeps/posix/sleep.c:55
#2 0x0000558bb85f732c in thread4 (arg=0x0) at test.cpp:80
#3 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#4 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 4 (Thread 0x7fea5b0d3700 (LWP 7294)):
#0 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5b0d2e20, remaining=remaining@entry=0x7fea5b0d2e20)
at ../sysdeps/unix/sysv/linux/nanosleep.c:28
#1 0x00007fea5c1a162a in __sleep (seconds=0) at ../sysdeps/posix/sleep.c:55
#2 0x0000558bb85f72e7 in thread3 (arg=0x0) at test.cpp:69
#3 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#4 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 3 (Thread 0x7fea5b8d4700 (LWP 7293)):
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
#1 0x00007fea5c5c7714 in __GI___pthread_mutex_lock (mutex=0x558bb85fa0a0 <mutex1>) at ../nptl/pthread_mutex_lock.c:80
#2 0x0000558bb85f724e in func2 () at test.cpp:31
#3 0x0000558bb85f72b5 in thread2 (arg=0x0) at test.cpp:56
#4 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#5 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 2 (Thread 0x7fea5c0d5700 (LWP 7292)):
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
#1 0x00007fea5c5c7714 in __GI___pthread_mutex_lock (mutex=0x558bb85fa0e0 <mutex2>) at ../nptl/pthread_mutex_lock.c:80
#2 0x0000558bb85f71ea in func1 () at test.cpp:18
#3 0x0000558bb85f728e in thread1 (arg=0x0) at test.cpp:43
#4 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#5 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 1 (Thread 0x7fea5c0d6740 (LWP 7291)):
#0 0x00007fea5c5c6495 in __GI___pthread_timedjoin_ex (threadid=140644543452928, thread_return=0x0, abstime=0x0, block=<optimized out>) at pthread_join_common.c:89
#1 0x0000558bb85f7444 in main () at test.cpp:110
(gdb) p mutex1
$1 = {__data = {__lock = 2, __count = 0, __owner = 7292, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "