pstack排查死锁问题

下午修改好代码,本地测试运行一切正常,就打包编译上线了。晚上钉钉报警群突然出现警告日志。运维大佬也赶紧通知我说下午上的服务出现问题,程序好像在哪里卡住了。进程还在运行,但是数据一直解析不了。
查看了系统资源一切正常。无奈马上回滚旧版本继续解析数据,服务不能停啊,影响线上业务。
在dev环境跑起来,一切正常。查看了一下下午修改的地方,主要修改了读取一个全局变量的方式,应该是这个锁有问题。无奈C++用得比较少,网上找了好多检测死锁的工具都没啥用。还好dev环境的进程重现了卡死的问题,记录下进程的pid=30。
查看进程30包含的线程

$ ps -T -p 30
PID   SPID         TIME    CMD
30    30 ?        00:00:16 litecoind
   30    32 ?        00:00:03 b-JobWorker_Run
   30    33 ?        00:00:35 b-JobWorker_Run
   30    34 ?        00:00:35 b-JobWorker_Run
   30    35 ?        00:00:35 b-JobWorker_Run
   30    36 ?        00:00:35 b-JobWorker_Run
   30    37 ?        00:00:35 b-JobWorker_Run
   30    38 ?        00:00:35 b-JobWorker_Run
   30    39 ?        00:00:35 b-JobWorker_Run
   30    40 ?        00:00:00 b-scriptch.0
   30    41 ?        00:00:00 b-scriptch.1
   30    42 ?        00:00:00 b-scriptch.2
   30    43 ?        00:00:00 b-scheduler
   30    44 ?        00:00:00 b-http
   30    45 ?        00:00:00 b-httpworker.0
   30    46 ?        00:00:00 b-httpworker.1
   30    47 ?        00:00:00 b-httpworker.2
   30    48 ?        00:00:00 b-httpworker.3
   30    49 ?        00:00:00 b-httpworker.4
   30    50 ?        00:00:00 b-httpworker.5
   30    51 ?        00:00:00 b-httpworker.6
   30    52 ?        00:00:00 b-httpworker.7
   30    61 ?        00:00:00 litecoind
   30    82 ?        00:00:00 b-torcontrol
   30    83 ?        00:00:03 b-net
   30    85 ?        00:00:00 b-addcon
   30    86 ?        00:00:00 b-opencon
   30    87 ?        00:00:10 b-msghand
   30    94 ?        00:00:34 b-RunParser

pstack打印所有线程的堆栈信息,日志比较多,这里只显示线程94、87、86的堆栈日志

$ pstack 30
Thread 29 (Thread 0x7fb8c5e7d700 (LWP 94)):
#0  0x00007fb94e456d50 in __GI___nanosleep (requested_time=requested_time@entry=0x7fb8c5e79590, remaining=remaining@entry=0x7fb8c5e79590) at ../sysdeps/unix/sysv/linux/nanosleep.c:28
#1  0x00005635a1d1099b in std::this_thread::sleep_for<long, std::ratio<1l, 1000l> > (__rtime=...) at /usr/include/c++/8/thread:379
#2  JobInstance::Join (this=0x7fb93c431900) at btcexplorer/Job.cpp:28
#3  0x00005635a1d125a6 in JobHandle::Join (this=0x7fb943848080) at btcexplorer/Job.h:112
#4  JobHandle::JoinAll(std::vector<JobHandle, std::allocator<JobHandle> >&) () at btcexplorer/Job.cpp:36
#5  0x00005635a1d86564 in BlockParserManager::ParseUnconfirm (this=this@entry=0x7fb8c5e7c500) at btcexplorer/ParserUnconfirm.cxx:439
#6  0x00005635a1d2f53e in BlockParserManager::DoParseUnconfirm (this=0x7fb8c5e7c500) at btcexplorer/ParserBlockParserManager.cxx:683
#7  RunParser() () at btcexplorer/Parser.cpp:115
#8  0x00007fb94c1596df in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#9  0x00007fb94e44c6db in start_thread (arg=0x7fb8c5e7d700) at pthread_create.c:463
#10 0x00007fb94b81671f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 28 (Thread 0x7fb8c687e700 (LWP 87)):
#0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1  0x00007fb94e44f0f4 in __GI___pthread_mutex_lock (mutex=0x7fb949d5f958) at ../nptl/pthread_mutex_lock.c:115
#2  0x00005635a1caee52 in __gthread_mutex_lock (__mutex=0x7fb8c687b9f8) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:748
#3  __gthread_recursive_mutex_lock (__mutex=0x7fb8c687b9f8) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:810
#4  std::recursive_mutex::lock (this=0x7fb8c687b9f8) at /usr/include/c++/8/mutex:107
#5  std::unique_lock<std::recursive_mutex>::lock (this=0x7fb8c687b9a0, this=0x7fb8c687b9a0) at /usr/include/c++/8/bits/std_mutex.h:267
#6  UniqueLock<AnnotatedMixin<std::recursive_mutex>, std::unique_lock<std::recursive_mutex> >::Enter (pszName=<optimized out>, pszFile=<optimized out>, nLine=<optimized out>, this=0x7fb8c687b9a0) at ./sync.h:143
#7  UniqueLock<AnnotatedMixin<std::recursive_mutex>, std::unique_lock<std::recursive_mutex> >::UniqueLock(AnnotatedMixin<std::recursive_mutex>&, char const*, char const*, int, bool) [clone .constprop.1448] () at ./sync.h:164
#8  0x00005635a1cb008c in CTxMemPool::GetMinFee(unsigned long) const () at txmempool.cpp:1145
#9  0x00005635a1b5a66d in PeerManager::SendMessages(CNode*) () at /usr/include/c++/8/ext/new_allocator.h:79
#10 0x00005635a1b2b3f5 in CConnman::ThreadMessageHandler() () at net.cpp:2242
#11 0x00005635a1b1f04e in std::function<void ()>::operator()() const (this=0x7fb8c687d710) at /usr/include/c++/8/bits/std_function.h:682
#12 TraceThread<std::function<void ()> >(char const*, std::function<void ()>) (name=<optimized out>, func=...) at ./util/system.h:438
#13 0x00005635a1b3da3f in std::__invoke_impl<void, void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> >(std::__invoke_other, void (*&&)(char const*, std::function<void ()>), char const*&&, std::function<void ()>&&) (__f=<optimized out>) at /usr/include/c++/8/bits/move.h:182
#14 std::__invoke<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> >(void (*&&)(char const*, std::function<void ()>), char const*&&, std::function<void ()>&&) (__fn=<optimized out>) at /usr/include/c++/8/bits/invoke.h:95
#15 std::thread::_Invoker<std::tuple<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> > >::_M_invoke<0ul, 1ul, 2ul>(std::_Index_tuple<0ul, 1ul, 2ul>) (this=<optimized out>) at /usr/include/c++/8/thread:244
#16 std::thread::_Invoker<std::tuple<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> > >::operator()() (this=<optimized out>) at /usr/include/c++/8/thread:253
#17 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(char const*, std::function<void ()>), char const*, std::function<void ()> > > >::_M_run() (this=<optimized out>) at /usr/include/c++/8/thread:196
#18 0x00007fb94c1596df in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#19 0x00007fb94e44c6db in start_thread (arg=0x7fb8c687e700) at pthread_create.c:463
#20 0x00007fb94b81671f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 27 (Thread 0x7fb8c727f700 (LWP 86)):
#0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1  0x00007fb94e44f0f4 in __GI___pthread_mutex_lock (mutex=0x5635a27a3180 <cs_main>) at ../nptl/pthread_mutex_lock.c:115
#2  0x00005635a1b49d92 in __gthread_mutex_lock (__mutex=<optimized out>) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:748
#3  __gthread_recursive_mutex_lock (__mutex=<optimized out>) at /usr/include/x86_64-linux-gnu/c++/8/bits/gthr-default.h:810...

通过日志可知,好几个线程都卡在__lll_lock_wait ()上等待锁,统计得到发生死锁的线程id:87、86、43、32。看了一下这几个线程的代码,确实都有读取全局变量的代码。比如有两个全局变量A、B,分别对应两把锁:当线程87拿到了A的锁(读取变量A),准备去获取B的锁时,另外一个线程32已经拿到了B的锁,正准备获取A的锁。两个线程互相等待对方释放锁,这就形成了死锁的局面。
解决办法:修改代码,读取完一个全局变量后马上释放对应的锁,再去读取其他的全局变量。
插曲:在Ubuntu上安装的pstack运行时报错:Input/output error。在网上找了教程,得到CentOS的pstack shell脚本:

#!/bin/sh
# pstack: print the stack backtrace of a running process by attaching GDB.
#
# Usage: pstack <process-id>
#
# Environment:
#   GDB      debugger executable to run (defaults to "gdb")
#   GDBARGS  extra arguments inserted into the GDB command line
#            (left unquoted on purpose so it is split into words)
#
# Exits with status 1 when the argument count is wrong or when
# /proc/<process-id> is not readable.

if test $# -ne 1; then
    echo "Usage: `basename $0 .sh` <process-id>" 1>&2
    exit 1
fi

if test ! -r /proc/$1; then
    echo "Process $1 not found." 1>&2
    exit 1
fi

# GDB doesn't allow "thread apply all bt" when the process isn't
# threaded; need to peek at the process to determine if that or the
# simpler "bt" should be used.

backtrace="bt"
if test -d /proc/$1/task ; then
    # Newer kernel; has a task/ directory.
    # More than one entry under task/ means the process has several threads.
    if test `/bin/ls /proc/$1/task | /usr/bin/wc -l` -gt 1 2>/dev/null ; then
        backtrace="thread apply all bt"
    fi
elif test -f /proc/$1/maps ; then
    # Older kernel; go by it loading libpthread.
    if /bin/grep -e libpthread /proc/$1/maps > /dev/null 2>&1 ; then
        backtrace="thread apply all bt"
    fi
fi

GDB=${GDB:-gdb}

# Run GDB, strip out unwanted noise.
# --readnever is no longer used since .gdb_index is now in use.
# The here-document holds only GDB commands; $backtrace is expanded by the
# shell before GDB reads it. The sed filter removes leading "(gdb) " prompts
# and prints only stack-frame lines ("#...") and thread headers ("Thread...").
$GDB --quiet -nx $GDBARGS /proc/$1/exe $1 <<EOF 2>&1 |
set width 0
set height 0
set pagination no
$backtrace
EOF
/bin/sed -n \
    -e 's/^\((gdb) \)*//' \
    -e '/^#/p' \
    -e '/^Thread/p'

添加执行权限后,执行成功

本作品采用《CC 协议》,转载必须注明作者和本文链接
用过哪些工具?为啥用这个工具(速度快,支持高并发...)?底层如何实现的?
讨论数量: 0
(= ̄ω ̄=)··· 暂无内容!

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!