zoukankan      html  css  js  c++  java
  • MongoDB副本集replica set(五)--故障排查

    (一)查看副本集状态
    要查看副本集状态及每个成员的状态,可以连接到主节点,使用 rs.status() 命令查看:

    rstest:PRIMARY> rs.status()
    {
    "set" : "rstest",
    "date" : ISODate("2020-06-30T15:06:44.586Z"),
    "myState" : 1,
    "term" : NumberLong(15),
    "syncingTo" : "",
    "syncSourceHost" : "",
    "syncSourceId" : -1,
    "heartbeatIntervalMillis" : NumberLong(2000),
    "majorityVoteCount" : 2,
    "writeMajorityCount" : 2,
    "optimes" : {
    "lastCommittedOpTime" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "lastCommittedWallTime" : ISODate("2020-06-30T15:06:38.972Z"),
    "readConcernMajorityOpTime" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "readConcernMajorityWallTime" : ISODate("2020-06-30T15:06:38.972Z"),
    "appliedOpTime" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "durableOpTime" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "lastAppliedWallTime" : ISODate("2020-06-30T15:06:38.972Z"),
    "lastDurableWallTime" : ISODate("2020-06-30T15:06:38.972Z")
    },
    "lastStableRecoveryTimestamp" : Timestamp(1593529578, 1),
    "lastStableCheckpointTimestamp" : Timestamp(1593529578, 1),
    "electionCandidateMetrics" : {
    "lastElectionReason" : "priorityTakeover",
    "lastElectionDate" : ISODate("2020-06-30T11:42:47.656Z"),
    "electionTerm" : NumberLong(15),
    "lastCommittedOpTimeAtElection" : {
    "ts" : Timestamp(1593517366, 1),
    "t" : NumberLong(14)
    },
    "lastSeenOpTimeAtElection" : {
    "ts" : Timestamp(1593517366, 1),
    "t" : NumberLong(14)
    },
    "numVotesNeeded" : 2,
    "priorityAtElection" : 3,
    "electionTimeoutMillis" : NumberLong(10000),
    "priorPrimaryMemberId" : 2,
    "numCatchUpOps" : NumberLong(0),
    "newTermStartDate" : ISODate("2020-06-30T11:42:47.731Z"),
    "wMajorityWriteAvailabilityDate" : ISODate("2020-06-30T11:42:48.789Z")
    },
    "electionParticipantMetrics" : {
    "votedForCandidate" : true,
    "electionTerm" : NumberLong(14),
    "lastVoteDate" : ISODate("2020-06-30T11:42:36.235Z"),
    "electionCandidateMemberId" : 2,
    "voteReason" : "",
    "lastAppliedOpTimeAtElection" : {
    "ts" : Timestamp(1593364745, 1),
    "t" : NumberLong(12)
    },
    "maxAppliedOpTimeInSet" : {
    "ts" : Timestamp(1593364745, 1),
    "t" : NumberLong(12)
    },
    "priorityAtElection" : 3
    },
    "members" : [
    {
    "_id" : 0,
    "name" : "192.168.10.41:27017",
    "health" : 1,
    "state" : 1,
    "stateStr" : "PRIMARY",
    "uptime" : 12262,
    "optime" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "optimeDate" : ISODate("2020-06-30T15:06:38Z"),
    "syncingTo" : "",
    "syncSourceHost" : "",
    "syncSourceId" : -1,
    "infoMessage" : "",
    "electionTime" : Timestamp(1593517367, 1),
    "electionDate" : ISODate("2020-06-30T11:42:47Z"),
    "configVersion" : 12,
    "self" : true,
    "lastHeartbeatMessage" : ""
    },
    {
    "_id" : 2,
    "name" : "192.168.10.43:27017",
    "health" : 1,
    "state" : 2,
    "stateStr" : "SECONDARY",
    "uptime" : 12258,
    "optime" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "optimeDurable" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "optimeDate" : ISODate("2020-06-30T15:06:38Z"),
    "optimeDurableDate" : ISODate("2020-06-30T15:06:38Z"),
    "lastHeartbeat" : ISODate("2020-06-30T15:06:43.724Z"),
    "lastHeartbeatRecv" : ISODate("2020-06-30T15:06:43.051Z"),
    "pingMs" : NumberLong(0),
    "lastHeartbeatMessage" : "",
    "syncingTo" : "192.168.10.41:27017",
    "syncSourceHost" : "192.168.10.41:27017",
    "syncSourceId" : 0,
    "infoMessage" : "",
    "configVersion" : 12
    },
    {
    "_id" : 3,
    "name" : "192.168.10.42:27017",
    "health" : 1,
    "state" : 2,
    "stateStr" : "SECONDARY",
    "uptime" : 12258,
    "optime" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "optimeDurable" : {
    "ts" : Timestamp(1593529598, 1),
    "t" : NumberLong(15)
    },
    "optimeDate" : ISODate("2020-06-30T15:06:38Z"),
    "optimeDurableDate" : ISODate("2020-06-30T15:06:38Z"),
    "lastHeartbeat" : ISODate("2020-06-30T15:06:43.724Z"),
    "lastHeartbeatRecv" : ISODate("2020-06-30T15:06:43.051Z"),
    "pingMs" : NumberLong(0),
    "lastHeartbeatMessage" : "",
    "syncingTo" : "192.168.10.43:27017",
    "syncSourceHost" : "192.168.10.43:27017",
    "syncSourceId" : 2,
    "infoMessage" : "",
    "configVersion" : 12
    }
    ],
    "ok" : 1,
    "$clusterTime" : {
    "clusterTime" : Timestamp(1593529598, 1),
    "signature" : {
    "hash" : BinData(0,"JcYryJc0DU0GgX0mCKS1D86r30Y="),
    "keyId" : NumberLong("6841443127941660675")
    }
    },
    "operationTime" : Timestamp(1593529598, 1)
    }
    View Code

    (二)检查复制滞后
    复制滞后是指主节点上的操作与该操作在从节点上被应用之间的延迟。复制滞后会导致读取数据不一致,严重时次级节点会跟不上主节点的oplog窗口,导致需要重新同步数据等。
    要检查复制滞后,可以在主节点使用 rs.printSlaveReplicationInfo() 方法(该方法自MongoDB 4.4.1起已弃用,新版本请使用 rs.printSecondaryReplicationInfo()),结果如下:

    rstest:PRIMARY> rs.printSlaveReplicationInfo()
    source: 192.168.10.43:27017
    syncedTo: Tue Jun 30 2020 23:14:59 GMT+0800 (CST)
    0 secs (0 hrs) behind the primary 
    source: 192.168.10.42:27017
    syncedTo: Tue Jun 30 2020 23:14:59 GMT+0800 (CST)
    0 secs (0 hrs) behind the primary

    滞后的原因:
    1.网络延迟。检查副本集成员之间的网络路由,确保没有数据丢包或网络路由问题。可以使用ping或traceroute来确认。
    2.磁盘吞吐量。使用iostat或vmstat评估磁盘状态。
    3.大事务操作。主节点上长时间运行的操作可能会阻塞次级节点的复制。
    4.不合理的写关心(write concern)。如果对主节点执行需要大量写入的大批量数据加载操作,尤其是使用未确认(unacknowledged)的写关心时,次级节点可能无法足够快地读取oplog,从而跟不上主节点。

    (三)测试所有成员之间的连接
    副本集的所有成员必须能够连接到该副本集中的每一个其它成员,才能支持复制。如果节点复制异常,可以排查节点之间的连通性。
    如存在3个节点的副本集:
    m1.example.net
    m2.example.net
    m3.example.net

    在节点m1.example.net上测试能否正常连接到其它2个节点

    mongo --host m2.example.net --port 27017
    mongo --host m3.example.net --port 27017 

    在节点m2.example.net上测试能否正常连接到其它2个节点

    mongo --host m1.example.net --port 27017
    mongo --host m3.example.net --port 27017

    在节点m3.example.net上测试能否正常连接到其它2个节点

    mongo --host m1.example.net --port 27017
    mongo --host m2.example.net --port 27017

    (四)检查oplog的大小
    较大的oplog可以使副本集对延迟的容忍度更高,要检查oplog大小,请在当前节点执行 rs.printReplicationInfo() 方法。

    rstest:SECONDARY> rs.printReplicationInfo()
    configured oplog size: 2115.693359375MB
    log length start to end: 634163secs (176.16hrs)
    oplog first event time: Tue Jun 23 2020 15:33:16 GMT+0800 (CST)
    oplog last event time: Tue Jun 30 2020 23:42:39 GMT+0800 (CST)
    now: Tue Jun 30 2020 23:42:50 GMT+0800 (CST)
    rstest:SECONDARY> 

    操作日志应该足够大,至少能够保存24小时的日志。

    【完】

  • 相关阅读:
    最易理解的傅里叶分析讲解
    python获取最大、最小值
    机器学习周志华——学习器性能度量
    机器学习周志华——模型评估与选择
    机器学习周志华——机器学习重要会议及期刊
    机器学习周志华——机器学习的应用领域
    机器学习周志华——机器学习的发展
    在github上保存vscode的配置(后续重新安装vscode时,可以十分方便地从github上下载安装这个保存的配置)
    windows 10 macbook air 无声音
    监控 4MM 6MM 8MM 选择
  • 原文地址:https://www.cnblogs.com/lijiaman/p/13215533.html
Copyright © 2011-2022 走看看