PostgreSQL同步复制主库挂起分析
发表于:2025-11-12 作者:千家信息网编辑
千家信息网最后更新 2025年11月12日,这篇文章主要讲解了"PostgreSQL同步复制主库挂起分析",文中的讲解内容简单清晰,易于学习与理解,下面请大家跟着小编的思路慢慢深入,一起来研究和学习"PostgreSQL同步复制主库挂起分析"吧
千家信息网最后更新 2025年11月12日PostgreSQL同步复制主库挂起分析
这篇文章主要讲解了"PostgreSQL同步复制主库挂起分析",文中的讲解内容简单清晰,易于学习与理解,下面请大家跟着小编的思路慢慢深入,一起来研究和学习"PostgreSQL同步复制主库挂起分析"吧!
在Streaming Replication环境中,PostgreSQL主节点设置为同步复制时,如果standby节点没有启动,或者网络出现问题导致standby无法连接到主节点,主节点上执行DML/DDL等需要提交事务的操作时,会话进程会在提交阶段挂起。下面分析这个挂起的问题.
一、数据结构
Latch
Latch结构体应被视为"不透明的"(opaque),并且只能通过公共的函数访问.在这里定义是为了允许把Latch嵌入到更大的结构体中.
//通常情况下,int类型的变量是原子访问的,也可以认为sig_atomic_t就是int类型的数据,
//因为对这些变量要求一条指令完成,所以sig_atomic_t不可能是结构体,只会是数字类型。
typedef int __sig_atomic_t;

/*
 * Latch structure should be treated as opaque and only accessed through
 * the public functions. It is defined here to allow embedding Latches as
 * part of bigger structs.
 * Latch结构体应被视为"不透明的"(opaque),并且只能通过公共的函数访问.
 * 在这里定义是为了允许把Latch嵌入到更大的结构体中.
 */
typedef struct Latch
{
    sig_atomic_t is_set;
    bool is_shared;
    int owner_pid;
#ifdef WIN32
    HANDLE event;
#endif
} Latch;
二、源码解读
N/A
三、跟踪分析
启动master节点,不启动standby节点,使用psql连接数据库,执行SQL,Session挂起:
testdb=# drop table t1;
使用gdb跟踪挂起的进程
[xdb@localhost ~]$ ps -ef|grep postgresxdb 1318 1 0 12:14 pts/0 00:00:00 /appdb/xdb/pg11.2/bin/postgresxdb 1319 1318 0 12:14 ? 00:00:00 postgres: logger xdb 1321 1318 0 12:14 ? 00:00:00 postgres: checkpointer xdb 1322 1318 0 12:14 ? 00:00:00 postgres: background writer xdb 1323 1318 0 12:14 ? 00:00:00 postgres: walwriter xdb 1324 1318 0 12:14 ? 00:00:00 postgres: autovacuum launcher xdb 1325 1318 0 12:14 ? 00:00:00 postgres: archiver xdb 1326 1318 0 12:14 ? 00:00:00 postgres: stats collector xdb 1327 1318 0 12:14 ? 00:00:00 postgres: logical replication launcher xdb 1331 1318 0 12:15 ? 00:00:00 postgres: xdb testdb [local] DROP TABLE waiting for 0/5D07B668[xdb@localhost ~]$ gdb -p 1331GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7...
查看调用栈,可以看到进程在提交事务(RecordTransactionCommit)时调用SyncRepWaitForLSN等待同步复制确认,经WaitLatch最终阻塞在epoll_wait上
(gdb) bt#0 0x00007f4636d48903 in __epoll_wait_nocancel () from /lib64/libc.so.6#1 0x000000000088e668 in WaitEventSetWaitBlock (set=0x21640e8, cur_timeout=-1, occurred_events=0x7ffc96572f40, nevents=1) at latch.c:1048#2 0x000000000088e543 in WaitEventSetWait (set=0x21640e8, timeout=-1, occurred_events=0x7ffc96572f40, nevents=1, wait_event_info=134217761) at latch.c:1000#3 0x000000000088dcec in WaitLatchOrSocket (latch=0x7f462d5b44d4, wakeEvents=17, sock=-1, timeout=-1, wait_event_info=134217761) at latch.c:385#4 0x000000000088dbcd in WaitLatch (latch=0x7f462d5b44d4, wakeEvents=17, timeout=-1, wait_event_info=134217761) at latch.c:339#5 0x0000000000863e2d in SyncRepWaitForLSN (lsn=1560786536, commit=true) at syncrep.c:286#6 0x0000000000546279 in RecordTransactionCommit () at xact.c:1359#7 0x0000000000546da3 in CommitTransaction () at xact.c:2074#8 0x0000000000547a3f in CommitTransactionCommand () at xact.c:2817#9 0x00000000008be250 in finish_xact_command () at postgres.c:2523#10 0x00000000008bbf45 in exec_simple_query (query_string=0x20a1d78 "drop table t1;") at postgres.c:1170#11 0x00000000008c0191 in PostgresMain (argc=1, argv=0x20cdcd8, dbname=0x20cdb40 "testdb", username=0x209ea98 "xdb") at postgres.c:4182#12 0x000000000081e06c in BackendRun (port=0x20c3b10) at postmaster.c:4361#13 0x000000000081d7df in BackendStartup (port=0x20c3b10) at postmaster.c:4033#14 0x0000000000819bd9 in ServerLoop () at postmaster.c:1706#15 0x000000000081948f in PostmasterMain (argc=1, argv=0x209ca50) at postmaster.c:1379#16 0x0000000000742931 in main (argc=1, argv=0x209ca50) at main.c:228(gdb)
kill该进程,重新连接数据库后,在WaitLatch上设置断点进行跟踪
#########[xdb@localhost ~]$ kill -9 1331#########testdb=# select pg_backend_pid(); pg_backend_pid ---------------- 1377(1 row)#########[xdb@localhost ~]$ gdb -p 1377...(gdb) b WaitLatchBreakpoint 1 at 0x88dbac: file latch.c, line 339.(gdb) #########testdb=# drop table t1;ERROR: table "t1" does not existtestdb=# create table t1(id int);
进入断点
(gdb) b WaitLatchBreakpoint 1 at 0x88dbac: file latch.c, line 339.(gdb) cContinuing.Breakpoint 1, WaitLatch (latch=0x7f462d5b44d4, wakeEvents=17, timeout=-1, wait_event_info=134217761) at latch.c:339339 return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,(gdb)
进入WaitLatchOrSocket
(gdb) step
WaitLatchOrSocket (latch=0x7f462d5b44d4, wakeEvents=17, sock=-1, timeout=-1, wait_event_info=134217761) at latch.c:359
359 int ret = 0;
(gdb) 
(gdb) p *latch
$1 = {is_set = 0, is_shared = true, owner_pid = 1377}
构建等待事件集
(gdb) n
362 WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
(gdb) n
364 if (wakeEvents & WL_TIMEOUT)
(gdb) 
367 timeout = -1;
(gdb) 
369 if (wakeEvents & WL_LATCH_SET)
(gdb) p *set
$2 = {nevents = 0, nevents_space = 3, events = 0x2181eb8, latch = 0x0, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}
(gdb) p *set->events
$3 = {pos = 0, events = 0, fd = 0, user_data = 0x0}
(gdb) p *set->epoll_ret_events
$4 = {events = 0, data = {ptr = 0x0, fd = 0, u32 = 0, u64 = 0}}
(gdb) 
$5 = {events = 0, data = {ptr = 0x0, fd = 0, u32 = 0, u64 = 0}}
(gdb) n
370 AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
(gdb) 
373 if (wakeEvents & WL_POSTMASTER_DEATH && IsUnderPostmaster)
(gdb) 
374 AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
(gdb) 
377 if (wakeEvents & WL_SOCKET_MASK)
(gdb) 
385 rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
(gdb) p *set
$6 = {nevents = 2, nevents_space = 3, events = 0x2181eb8, latch = 0x7f462d5b44d4, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}
(gdb) p *set->events
$7 = {pos = 0, events = 1, fd = 11, user_data = 0x0}
(gdb) p *set->epoll_ret_events
$8 = {events = 0, data = {ptr = 0x0, fd = 0, u32 = 0, u64 = 0}}
(gdb)
进入WaitEventSetWait
(gdb) stepWaitEventSetWait (set=0x2181e90, timeout=-1, occurred_events=0x7ffc96572f40, nevents=1, wait_event_info=134217761) at latch.c:925925 int returned_events = 0;(gdb)
输入参数
(gdb) n
928 long cur_timeout = -1;
(gdb) p *set
$9 = {nevents = 2, nevents_space = 3, events = 0x2181eb8, latch = 0x7f462d5b44d4, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}
(gdb) p *occurred_events
$10 = {pos = 35135068, events = 0, fd = -1772664741, user_data = 0x7ffc96572fa0}
(gdb)
执行相关判断和设置参数
(gdb) n930 Assert(nevents > 0);(gdb) 936 if (timeout >= 0)(gdb) 943 pgstat_report_wait_start(wait_event_info);(gdb) 946 waiting = true;(gdb)
未有事件出现,则循环
951 while (returned_events == 0)(gdb)
set->latch->is_set为0,不满足latch已被设置的条件,不会跳出循环,继续往下执行
982 if (set->latch && set->latch->is_set)
(gdb) p *set->latch
$11 = {is_set = 0, is_shared = true, owner_pid = 1377}
(gdb)
进入WaitEventSetWaitBlock
(gdb) n1000 rc = WaitEventSetWaitBlock(set, cur_timeout,(gdb) stepWaitEventSetWaitBlock (set=0x2181e90, cur_timeout=-1, occurred_events=0x7ffc96572f40, nevents=1) at latch.c:10421042 int returned_events = 0;(gdb)
调用epoll_wait,挂起
(gdb) n
1048 rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
(gdb) p *set
$12 = {nevents = 2, nevents_space = 3, events = 0x2181eb8, latch = 0x7f462d5b44d4, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}
(gdb) 
(gdb) n
启动standby节点
####[xdb@localhost ~]$ pg_ctl startpg_ctl: another server might be running; trying to start server anyway...
接收到信号
Program received signal SIGUSR1, User defined signal 1.
0x00007f4636d48903 in __epoll_wait_nocancel () from /lib64/libc.so.6
(gdb) 
(gdb) n
Single stepping until exit from function __epoll_wait_nocancel,
which has no line number information.
procsignal_sigusr1_handler (postgres_signal_arg=-1) at procsignal.c:262
262 {
(gdb)
感谢各位的阅读,以上就是"PostgreSQL同步复制主库挂起分析"的内容了,经过本文的学习后,相信大家对PostgreSQL同步复制主库挂起分析这一问题有了更深刻的体会,具体使用情况还需要大家实践验证。这里是千家信息网,小编将为大家推送更多相关知识点的文章,欢迎关注!
节点
分析
结构
同步
数据
类型
进程
问题
学习
跟踪
事件
内容
函数
参数
变量
就是
情况
断点
循环
运行
数据库的安全要保护哪些东西
数据库安全各自的含义是什么
生产安全数据库录入
数据库的安全性及管理
数据库安全策略包含哪些
海淀数据库安全审计系统
建立农村房屋安全信息数据库
易用的数据库客户端支持安全管理
连接数据库失败ssl安全错误
数据库的锁怎样保障安全
太仓运营网络技术咨询热线
软件开发包收费标准
ass 数据库
贵州山区里的云服务器云空间
西藏英文服务器租用
甘肃省网络安全管理中心
服务器不能识别网卡
阜阳雅城互联网科技有限公司
wind数据库使用案例
plsql 数据库为空
网络安全推送信息
网络安全运营持续有效
新时代网络技术工业化
数据大批量导入数据库
服务器管理容易吗
苏州刀片服务器厂家直销
软件开发公司一年利润多少
ieee无线网络技术
数据库插入数据保存后搜不到
奉贤区品质软件开发网上价格
门锁管理服务器
c数据库中的自定义按钮在哪
绍兴盛推网络技术公司
西藏英文服务器租用
甘肃省网络安全管理中心
cae软件开发兼职
易语言服务器管理员权限
网络安全警察考取
三明财务软件开发哪家强
网络安全共有多少条目