问题描述
客户报告我们一个API在查询到第11页时老是出现504Gateway Timout ERROR
问题排查
根据Code我们发现该API是一个mongo的SQL 语句
db['sxaimsubscriber'].find({'organizationId': '12615054', 'flowDiscovered': true, 'mappedBy': {'$regex': '.*DHCP.*'}, 'ipAddress': {'$exists': 1}, 'deletedTime': {'$exists': 0}},projection={'_id': 1, 'ipAddress': 1}).sort({'_id':1}).skip(30000).limit(3000)ionStats")
但奇怪的是,同样的SQL,在调用分页1~10页的时候都可以在5s内返回数据,但到第11页5分钟也不返回。
同时测试过,如果不分页,总查询数据量也就在5W左右,在2分钟之内也能返回数据。
根据官方文档https://docs.mongodb.com/manual/reference/method/cursor.skip/#cursor.skip ,skip可能会引发慢查询,于是我们分析查询计划
db['sxaimsubscriber'].find({'organizationId': '12615054', 'flowDiscovered': true, 'mappedBy': {'$regex': '.*DHCP.*'}, 'ipAddress': {'$exists': 1}, 'deletedTime': {'$exists': 0}},projection={'_id': 1, 'ipAddress': 1}).sort({'_id':1}).skip(10000).limit(3000).explain("executionStats")
当我们skip10000条的时候,执行计划中发现是走了Index的
"executionStats" : { "executionSuccess" : true, "nReturned" : 3000, "executionTimeMillis" : 17843, "totalKeysExamined" : 151864, "totalDocsExamined" : 53312, "executionStages" : { "stage" : "SKIP", "nReturned" : 3000, "executionTimeMillisEstimate" : 1136, "works" : 164866, "advanced" : 3000, "needTime" : 161865, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "skipAmount" : 0, "inputStage" : { "stage" : "PROJECTION", "nReturned" : 13000, "executionTimeMillisEstimate" : 1106, "works" : 164866, "advanced" : 13000, "needTime" : 151865, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "transformBy" : { "_id" : 1, "ipAddress" : 1 }, "inputStage" : { "stage" : "SORT", "nReturned" : 13000, "executionTimeMillisEstimate" : 1056, "works" : 164866, "advanced" : 13000, "needTime" : 151865, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "sortPattern" : { "_id" : -1 }, "memUsage" : 13239001, "memLimit" : 33554432, "limitAmount" : 13000, "inputStage" : { "stage" : "KEEP_MUTATIONS", "nReturned" : 46369, "executionTimeMillisEstimate" : 806, "works" : 151865, "advanced" : 46369, "needTime" : 105495, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "inputStage" : { "stage" : "SORT_KEY_GENERATOR", "nReturned" : 0, "executionTimeMillisEstimate" : 796, "works" : 151865, "advanced" : 0, "needTime" : 105495, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "inputStage" : { "stage" : "FETCH", "filter" : { "$and" : [ { "flowDiscovered" : { "$eq" : true } }, { "ipAddress" : { "$exists" : true } }, { "$not" : { "deletedTime" : { "$exists" : true } } } ] }, "nReturned" : 46369, "executionTimeMillisEstimate" : 707, "works" : 151864, "advanced" : 46369, "needTime" : 105494, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "docsExamined" : 53312, "alreadyHasObj" : 0, "inputStage" : { "stage" : "IXSCAN", "filter" : { "mappedBy" : /.*DHCP.*/ }, "nReturned" : 53312, "executionTimeMillisEstimate" : 467, "works" : 151864, "advanced" : 53312, "needTime" : 98551, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "keyPattern" : { "organizationId" : 1, "mappedBy" : 1 }, "indexName" : "organizationId_mapped", "isMultiKey" : false, "isUnique" : false, "isSparse" : false, "isPartial" : false, "indexVersion" : 1, "direction" : "forward", "indexBounds" : { "organizationId" : [ "["12615054", "12615054"]" ], "mappedBy" : [ "["", {})", "[/.*DHCP.*/, /.*DHCP.*/]" ] }, "keysExamined" : 151864, "dupsTested" : 0, "dupsDropped" : 0, "seenInvalidated" : 0 } } } } } } } }
但我们skip30000条的时候,发现根本没有走Index
"executionStats" : { "executionSuccess" : true, "nReturned" : 3000, "executionTimeMillis" : 561807, "totalKeysExamined" : 6149055, "totalDocsExamined" : 6149055, "executionStages" : { "stage" : "LIMIT", "nReturned" : 3000, "executionTimeMillisEstimate" : 74140, "works" : 6290481, "advanced" : 3000, "needTime" : 6146055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 1, "invalidates" : 1219693, "limitAmount" : 3000, "inputStage" : { "stage" : "SKIP", "nReturned" : 3000, "executionTimeMillisEstimate" : 74000, "works" : 6290480, "advanced" : 3000, "needTime" : 6146055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "skipAmount" : 0, "inputStage" : { "stage" : "PROJECTION", "nReturned" : 33000, "executionTimeMillisEstimate" : 73630, "works" : 6290480, "advanced" : 33000, "needTime" : 6116055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "transformBy" : { "_id" : 1, "ipAddress" : 1 }, "inputStage" : { "stage" : "FETCH", "filter" : { "$and" : [ { "flowDiscovered" : { "$eq" : true } }, { "organizationId" : { "$eq" : "12615054" } }, { "mappedBy" : /.*DHCP.*/ }, { "ipAddress" : { "$exists" : true } }, { "$not" : { "deletedTime" : { "$exists" : true } } } ] }, "nReturned" : 33000, "executionTimeMillisEstimate" : 73188, "works" : 6290480, "advanced" : 33000, "needTime" : 6116055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "docsExamined" : 6149055, "alreadyHasObj" : 0, "inputStage" : { "stage" : "IXSCAN", "nReturned" : 6149055, "executionTimeMillisEstimate" : 4814, "works" : 6149055, "advanced" : 6149055, "needTime" : 0, "needYield" : 0, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "keyPattern" : { "_id" : 1 }, "indexName" : "_id_", "isMultiKey" : false, "isUnique" : true, "isSparse" : false, "isPartial" : false, "indexVersion" : 1, "direction" : "backward", "indexBounds" : { "_id" : [ "[MaxKey, MinKey]" ] }, "keysExamined" : 6149055, "dupsTested" : 0, "dupsDropped" : 0, "seenInvalidated" : 0 } } } } }
同时查看mongo日志,发现这个查询扫描了大量的rows:
2020-04-07T08:03:18.805-0700 I COMMAND [conn64846408] query sxa.sxaimsubscriber query: { $query: { organizationId: "12615054", ipAddress: { $exists: 1 }, mappedBy: { $regex: ".*DHCP.*" }, flowDiscovered: true, deletedTime: { $exists: 0 } }, $orderby: { _id: 1 } } planSummary: IXSCAN { organizationId: 1.0, ipAddress: 1.0 }, IXSCAN { organizationId: 1.0, ipAddress: 1.0 } cursorid:537636864055 ntoreturn:3000 ntoskip:24000 keysExamined:151858 docsExamined:151858 hasSortStage:1 keyUpdates:0 writeConflicts:0 numYields:1398 nreturned:3000 reslen:242289 locks:{ Global: { acquireCount: { r: 2798 } }, MMAPV1Journal: { acquireCount: { r: 1405 }, acquireWaitCount: { r: 1 }, timeAcquiringMicros: { r: 7865 } }, Database: { acquireCount: { r: 1399 } }, Collection: { acquireCount: { R: 1399 }, acquireWaitCount: { R: 6 }, timeAcquiringMicros: { R: 5365 } } } 898ms 2020-04-07T08:03:20.366-0700 I COMMAND [conn64846413] query sxa.sxaimsubscriber query: { $query: { organizationId: "12615054", ipAddress: { $exists: 1 }, mappedBy: { $regex: ".*DHCP.*" }, flowDiscovered: true, deletedTime: { $exists: 0 } }, $orderby: { _id: 1 } } planSummary: IXSCAN { organizationId: 1.0, ipAddress: 1.0 }, IXSCAN { organizationId: 1.0, ipAddress: 1.0 } cursorid:539455280546 ntoreturn:3000 ntoskip:27000 keysExamined:151858 docsExamined:151858 hasSortStage:1 keyUpdates:0 writeConflicts:0 numYields:1422 nreturned:3000 reslen:242313 locks:{ Global: { acquireCount: { r: 2846 } }, MMAPV1Journal: { acquireCount: { r: 1426 }, acquireWaitCount: { r: 1 }, timeAcquiringMicros: { r: 16191 } }, Database: { acquireCount: { r: 1423 } }, Collection: { acquireCount: { R: 1423 }, acquireWaitCount: { R: 3 }, timeAcquiringMicros: { R: 3438 } } } 891ms 2020-04-07T08:08:25.309-0700 I COMMAND [conn64846425] query sxa.sxaimsubscriber query: { $query: { organizationId: "12615054", ipAddress: { $exists: 1 }, mappedBy: { $regex: ".*DHCP.*" }, flowDiscovered: true, deletedTime: { $exists: 0 } }, $orderby: { _id: 1 } } planSummary: IXSCAN { _id: 1 } cursorid:540082639314 ntoreturn:3000 ntoskip:30000 keysExamined:6158182 docsExamined:6158182 fromMultiPlanner:1 replanned:1 keyUpdates:0 writeConflicts:0 numYields:272958 nreturned:3000 reslen:242407 locks:{ Global: { acquireCount: { r: 545918 } }, MMAPV1Journal: { acquireCount: { r: 273683 }, acquireWaitCount: { r: 283 }, timeAcquiringMicros: { r: 3151680 } }, Database: { acquireCount: { r: 272959 } }, Collection: { acquireCount: { R: 272959 }, acquireWaitCount: { R: 724 }, timeAcquiringMicros: { R: 1861964 } } } 304275ms
同样,我们测试过,如果不加sort排序,在执行到第11页的时候也能快速返回,所以发现问题就在sort排序上。
根据官方文档
Add an index, or specify a smaller limit In MongoDB, sort operations can obtain the sort order by retrieving documents based on the ordering in an index.
If the query planner cannot obtain the sort order from an index, it will sort the results in memory.
Sort operations that use an index often have better performance than those that do not use an index.
In addition, sort operations that do not use an index will abort when they use 32 megabytes of memory.
因为我们skip3000条记录后,超过了系统默认的32MB内存排序,所以mongo重新使用了其他算法排序,出现大量扫描,导致查询非常慢。
解决方法
系统默认使用32MB 作为buffer
> use admin switched to db admin > db.runCommand( { getParameter : 1, "internalQueryExecMaxBlockingSortBytes" : 1 } ) { "internalQueryExecMaxBlockingSortBytes" : 33554432, "ok" : 1 }
提高buffer到50MB
> db.adminCommand({setParameter: 1, internalQueryExecMaxBlockingSortBytes:50151432}) { "was" : 33554432, "ok" : 1 }
同样也可以修改config配置文件
setParameter=internalQueryExecMaxBlockingSortBytes=309715200
改完后,再试API就能快速返回了。
参考文章:
https://zhuanlan.zhihu.com/p/61242164
https://askubuntu.com/questions/501937/how-to-increase-buffered-data-limit
https://stackoverflow.com/a/38408403/2400151
https://studio3t.com/knowledge-base/articles/mongodb-query-performance/
https://docs.mongodb.com/manual/core/index-compound/#index-ascending-and-descending