pstack排查死锁问题
下午修改好代码,本地测试运行一切正常,就打包编译上线了。晚上钉钉报警群突然出现警告日志。运维大佬也赶紧通知我说下午上的服务出现问题,程序好像在哪里卡住了。进程还在运行,但是数据一直解析不了。
查看了系统资源一切正常。无奈马上回滚旧版本继续解析数据,服务不能停啊,影响线上业务。
在dev环境跑起来,一切正常。查看了一下下午修改的地方,主要修改了读取一个全局变量的方式,应该是这个锁有问题。无奈C++用得比较少,在网上找了好多检测死锁的工具都没啥用。还好dev环境的进程重现了卡死的问题,记录下进程号 pid=30。
查看进程30包含的线程
$ ps -T -p 30
PID SPID TIME CMD
30 30 ? 00:00:16 litecoind
30 32 ? 00:00:03 b-JobWorker_Run
30 33 ? 00:00:35 b-JobWorker_Run
30 34 ? 00:00:35 b-JobWorker_Run
30 35 ? 00:00:35 b-JobWorker_Run
30 36 ? 00:00:35 b-JobWorker_Run
30 37 ? 00:00:35 b-JobWorker_Run
30 38 ? 00:00:35 b-JobWorker_Run
30 39 ? 00:00:35 b-JobWorker_Run
30 40 ? 00:00:00 b-scriptch.0
30 41 ? 00:00:00 b-scriptch.1
30 42 ? 00:00:00 b-scriptch.2
30 43 ? 00:00:00 b-scheduler
30 44 ? 00:00:00 b-http
30 45 ? 00:00:00 b-httpworker.0
30 46 ? 00:00:00 b-httpworker.1
30 47 ? 00:00:00 b-httpworker.2
30 48 ? 00:00:00 b-httpworker.3
30 49 ? 00:00:00 b-httpworker.4
30 50 ? 00:00:00 b-httpworker.5
30 51 ? 00:00:00 b-httpworker.6
30 52 ? 00:00:00 b-httpworker.7
30 61 ? 00:00:00 litecoind
30 82 ? 00:00:00 b-torcontrol
30 83 ? 00:00:03 b-net
30 85 ? 00:00:00 b-addcon
30 86 ? 00:00:00 b-opencon
30 87 ? 00:00:10 b-msghand
30 94 ? 00:00:34 b-RunParser
pstack打印所有线程的堆栈信息,日志比较多,这里只显示线程94,87,86的堆栈日志
$ pstack 30
Thread 29 (Thread 0x7fb8c5e7d700 (LWP 94)):
#0 0x00007fb94e456d50 in __GI___nanosleep (requested_time=requested_time@entry=0x7fb8c5e79590, remaining=remaining@entry=0x7fb8c5e79590) at ../sysdeps/unix/sysv/linux/nanosleep.c:28
#1 0x00005635a1d1099b in std::this_thread::sleep_for<long, std::ratio<1l, 1000l> > (__rtime=...) at /usr/include/c++/8/thread:379
#2 JobInstance::Join (this=0x7fb93c431900) at btcexplorer/Job.cpp:28
#3 0x00005635a1d125a6 in JobHandle::Join (this=0x7fb943848080) at btcexplorer/Job.h:112
#4 JobHandle::JoinAll(std::vector<JobHandle, std::allocator<JobHandle> >&) () at btcexplorer/Job.cpp:36
#5 0x00005635a1d86564 in BlockParserManager::ParseUnconfirm (this=this@entry=0x7fb8c5e7c500) at btcexplorer/ParserUnconfirm.cxx:439
#6 0x00005635a1d2f53e in BlockParserManager::DoParseUnconfirm (this=0x7fb8c5e7c500) at btcexplorer/ParserBlockParserManager.cxx:683
#7 RunParser() () at btcexplorer/Parser.cpp:115
#8 0x00007fb94c1596df in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#9 0x00007fb94e44c6db in start_thread (arg=0x7fb8c5e7d700) at pthread_create.c:463
#10 0x00007fb94b81671f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 28 (Thread 0x7fb8c687e700 (LWP 87)):
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1 0x00007fb94e44f0f4 in __GI___pthread_mutex_lock (mutex=0x7fb949d5f958) at ../nptl/pthread_mutex_lock.c:115
#2 0x00005635a1caee52 in __gthread_mutex_lock (__mutex=0x7fb8c687b9f8) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:748
#3 __gthread_recursive_mutex_lock (__mutex=0x7fb8c687b9f8) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:810
#4 std::recursive_mutex::lock (this=0x7fb8c687b9f8) at /usr/include/c++/8/mutex:107
#5 std::unique_lock<std::recursive_mutex>::lock (this=0x7fb8c687b9a0, this=0x7fb8c687b9a0) at /usr/include/c++/8/bits/std_mutex.h:267
#6 UniqueLock<AnnotatedMixin<std::recursive_mutex>, std::unique_lock<std::recursive_mutex> >::Enter (pszName=<optimized out>, pszFile=<optimized out>, nLine=<optimized out>, this=0x7fb8c687b9a0) at ./sync.h:143
#7 UniqueLock<AnnotatedMixin<std::recursive_mutex>, std::unique_lock<std::recursive_mutex> >::UniqueLock(AnnotatedMixin<std::recursive_mutex>&, char const*, char const*, int, bool) [clone .constprop.1448] () at ./sync.h:164
#8 0x00005635a1cb008c in CTxMemPool::GetMinFee(unsigned long) const () at txmempool.cpp:1145
#9 0x00005635a1b5a66d in PeerManager::SendMessages(CNode*) () at /usr/include/c++/8/ext/new_allocator.h:79
#10 0x00005635a1b2b3f5 in CConnman::ThreadMessageHandler() () at net.cpp:2242
#11 0x00005635a1b1f04e in std::function<void ()>::operator()() const (this=0x7fb8c687d710) at /usr/include/c++/8/bits/std_function.h:682
#12 TraceThread<std::function<void ()> >(char const*, std::function<void ()>) (name=<optimized out>, func=...) at ./util/system.h:438
#13 0x00005635a1b3da3f in std::__invoke_impl<void, void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> >(std::__invoke_other, void (*&&)(char const*, std::function<void ()>), char const*&&, std::function<void ()>&&) (__f=<optimized out>) at /usr/include/c++/8/bits/move.h:182
#14 std::__invoke<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> >(void (*&&)(char const*, std::function<void ()>), char const*&&, std::function<void ()>&&) (__fn=<optimized out>) at /usr/include/c++/8/bits/invoke.h:95
#15 std::thread::_Invoker<std::tuple<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> > >::_M_invoke<0ul, 1ul, 2ul>(std::_Index_tuple<0ul, 1ul, 2ul>) (this=<optimized out>) at /usr/include/c++/8/thread:244
#16 std::thread::_Invoker<std::tuple<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> > >::operator()() (this=<optimized out>) at /usr/include/c++/8/thread:253
#17 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> > > >::_M_run() (this=<optimized out>) at /usr/include/c++/8/thread:196
#18 0x00007fb94c1596df in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#19 0x00007fb94e44c6db in start_thread (arg=0x7fb8c687e700) at pthread_create.c:463
#20 0x00007fb94b81671f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 27 (Thread 0x7fb8c727f700 (LWP 86)):
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1 0x00007fb94e44f0f4 in __GI___pthread_mutex_lock (mutex=0x5635a27a3180 <cs_main>) at ../nptl/pthread_mutex_lock.c:115
#2 0x00005635a1b49d92 in __gthread_mutex_lock (__mutex=<optimized out>) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:748
#3 __gthread_recursive_mutex_lock (__mutex=<optimized out>) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:810...
通过日志可知,好几个线程都在__lll_lock_wait ()等待,统计得到发生死锁的线程id: 87,86,43,32. 看了一下这几个线程的代码,确实都有读取全局变量的代码。比如有两个全局变量A,B,分别对应两个锁。当线程87读取了变量A,准备读取变量B时,另外一个线程32已经读取了变量B,准备读A。这就形成了死锁的局面。
解决办法:修改代码,读取一个全局变量后马上释放对应的锁,再去读取其他的全局变量,避免同一个线程同时持有两把锁。
插曲:在Ubuntu上安装的pstack执行时报错 Input/output error,在网上找了教程,得到CentOS的pstack shell脚本:
#!/bin/sh
# pstack: print the stack trace of every thread of a running process.
#
# Usage: pstack <process-id>
#
# Attaches gdb to the given process in batch fashion (commands are fed on
# stdin through a here-document), asks for a backtrace, and filters gdb's
# output down to the thread headers and stack-frame lines.
#
# Environment:
#   GDB      gdb executable to run (defaults to "gdb").
#   GDBARGS  extra arguments passed to gdb; intentionally left unquoted
#            below so that it is split into separate words.

# Exactly one argument (the pid) is required.
if test $# -ne 1; then
    echo "Usage: `basename $0 .sh` <process-id>" 1>&2
    exit 1
fi

# The process must exist and its /proc entry must be readable by us.
if test ! -r "/proc/$1"; then
    echo "Process $1 not found." 1>&2
    exit 1
fi

# GDB doesn't allow "thread apply all bt" when the process isn't
# threaded; need to peek at the process to determine if that or the
# simpler "bt" should be used.
backtrace="bt"
if test -d "/proc/$1/task" ; then
    # Newer kernel; has a task/ directory with one entry per thread.
    if test `/bin/ls "/proc/$1/task" | /usr/bin/wc -l` -gt 1 2>/dev/null ; then
        backtrace="thread apply all bt"
    fi
elif test -f "/proc/$1/maps" ; then
    # Older kernel; go by it loading libpthread.
    if /bin/grep -e libpthread "/proc/$1/maps" > /dev/null 2>&1 ; then
        backtrace="thread apply all bt"
    fi
fi

GDB=${GDB:-gdb}

# Run GDB, strip out unwanted noise.
# --readnever is no longer used since .gdb_index is now in use.
# The here-document must contain only gdb commands and must be closed by
# EOF right after $backtrace; anything else placed before EOF would be
# sent to gdb as commands.
$GDB --quiet -nx $GDBARGS "/proc/$1/exe" "$1" <<EOF 2>&1 |
set width 0
set height 0
set pagination no
$backtrace
EOF
/bin/sed -n \
    -e 's/^\((gdb) \)*//' \
    -e '/^#/p' \
    -e '/^Thread/p'
添加执行权限后,执行成功
本作品采用《CC 协议》,转载必须注明作者和本文链接