zoukankan      html  css  js  c++  java
  • Use Elasticsearch to solve a top-N problem

    The raw data looks like this:

    timestamp,   router,    interface,   src_ip,    dst_ip,    protocol,   pkts
    10000000,    1.1.1.1    1            2.2.2.2    1.3.3.3    tcp         100
    10000000,    1.1.1.2    2            2.2.8.2    2.3.3.3    tcp         200
    10000001,    8.1.1.1    1            2.2.2.8    3.3.3.3    udp         500
    10000001,    2.1.1.1    1            2.2.8.2    4.3.3.8    udp         800
    

    I put this data into Elasticsearch. Now I want to solve the following problem:

    What are the top 3 combinations of src_ip and dst_ip that send the most pkts?

    Translated to SQL, this requirement would be:

        SELECT src_ip, dst_ip, sum(pkts) as total FROM raw_data GROUP BY src_ip, dst_ip ORDER BY total DESC LIMIT 3
    

    First of all, below is the data that I am trying to group and aggregate:

    {"TAG":10001,"SRC_MAC":"52:54:00:14:05:4a","DST_MAC":"52:54:00:2c:e4:7c","VLAN":1342,"COS":84,"IN_IFACE":2,"OUT_IFACE":2,"SRC_IP":"42.120.85.133","DST_IP":"42.120.83.164","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":13628,"DST_PORT":13783,"PROTOCOL":"ggp","PACKETS":11,"BYTES":4330,"time":1452643200}
    {"TAG":10002,"SRC_MAC":"52:54:00:5d:b5:10","DST_MAC":"52:54:00:18:d8:de","VLAN":543,"COS":66,"IN_IFACE":2,"OUT_IFACE":2,"SRC_IP":"42.120.83.123","DST_IP":"42.120.86.184","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":14731,"DST_PORT":14856,"PROTOCOL":"ip","PACKETS":6,"BYTES":3958,"time":1452643200}
    {"TAG":10002,"SRC_MAC":"52:54:00:50:14:e1","DST_MAC":"52:54:00:3f:50:38","VLAN":250,"COS":77,"IN_IFACE":2,"OUT_IFACE":3,"SRC_IP":"42.120.83.165","DST_IP":"42.120.86.172","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":11778,"DST_PORT":14673,"PROTOCOL":"isis","PACKETS":2,"BYTES":3803,"time":1452643200}
    {"TAG":10001,"SRC_MAC":"52:54:00:17:5e:e3","DST_MAC":"52:54:00:75:da:af","VLAN":2647,"COS":2,"IN_IFACE":2,"OUT_IFACE":1,"SRC_IP":"42.120.86.253","DST_IP":"42.120.83.58","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":16767,"DST_PORT":16418,"PROTOCOL":"ipv6-route","PACKETS":10,"BYTES":2852,"time":1452643200}
    {"TAG":10002,"SRC_MAC":"52:54:00:4a:e6:49","DST_MAC":"52:54:00:37:f2:78","VLAN":1005,"COS":88,"IN_IFACE":3,"OUT_IFACE":2,"SRC_IP":"42.120.81.90","DST_IP":"42.120.81.248","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":11573,"DST_PORT":16757,"PROTOCOL":"encap","PACKETS":7,"BYTES":1745,"time":1452643200}
    {"TAG":10001,"SRC_MAC":"52:54:00:52:1e:29","DST_MAC":"52:54:00:6a:2b:0e","VLAN":1816,"COS":26,"IN_IFACE":2,"OUT_IFACE":3,"SRC_IP":"42.120.82.91","DST_IP":"42.120.85.121","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":15961,"DST_PORT":15761,"PROTOCOL":"gre","PACKETS":16,"BYTES":2753,"time":1452643200}
    {"TAG":10000,"SRC_MAC":"52:54:00:3d:16:89","DST_MAC":"52:54:00:33:b8:54","VLAN":3393,"COS":64,"IN_IFACE":4,"OUT_IFACE":2,"SRC_IP":"42.120.86.27","DST_IP":"42.120.85.184","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":18677,"DST_PORT":17202,"PROTOCOL":"eigrp","PACKETS":6,"BYTES":3474,"time":1452643200}
    {"TAG":10000,"SRC_MAC":"52:54:00:01:bb:4c","DST_MAC":"52:54:00:21:91:c0","VLAN":3803,"COS":23,"IN_IFACE":1,"OUT_IFACE":2,"SRC_IP":"42.120.85.186","DST_IP":"42.120.82.206","SRC_MASK":24,"DST_MASK":24,"SRC_PORT":15093,"DST_PORT":18784,"PROTOCOL":"ospf","PACKETS":20,"BYTES":3171,"time":1452643200}     
    

    The mapping

    	curl -XGET localhost:9200/sflow/_mapping | json_reformat 
    
    	{
    		"sflow": {
    			"mappings": {
    				"9k": {
    					"properties": {
    						"BYTES": {
    							"type": "long"
    						},
    						"COS": {
    							"type": "long"
    						},
    						"DST_IP": {
    							"type": "string"
    						},
    						"DST_MAC": {
    							"type": "string"
    						},
    						"DST_MASK": {
    							"type": "long"
    						},
    						"DST_PORT": {
    							"type": "long"
    						},
    						"IN_IFACE": {
    							"type": "long"
    						},
    						"OUT_IFACE": {
    							"type": "long"
    						},
    						"PACKETS": {
    							"type": "long"
    						},
    						"PROTOCOL": {
    							"type": "string"
    						},
    						"SRC_IP": {
    							"type": "string"
    						},
    						"SRC_MAC": {
    							"type": "string"
    						},
    						"SRC_MASK": {
    							"type": "long"
    						},
    						"SRC_PORT": {
    							"type": "long"
    						},
    						"TAG": {
    							"type": "long"
    						},
    						"VLAN": {
    							"type": "long"
    						},
    						"time": {
    							"type": "long"
    						}
    					}
    				}
    			}
    		}
    	}
    
    
    

    The aggregation query

    curl -XPOST 'localhost:9200/sflow/_search?pretty' -d '
    {
      "query": { "match_all": {} },
    	"aggs":
    		{
    			"by_SRC_IP": 
    				{
    					"terms": {"script": "[doc.SRC_IP.value, doc.DST_IP.value].join('-')","size": 3, "order": {"sum_bits": "desc"}},
    					"aggs": { "sum_bits": { "sum": {"field": "BYTES"} } }
    				}
    		}
    }'
    

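    The output (note that the script reached Groovy as join(-) with no quotes around the dash: the request body is wrapped in single quotes for the shell, so the shell consumed the inner single quotes around '-' before curl sent the request)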

    {
      "error" : {
        "root_cause" : [ {
          "type" : "groovy_script_compilation_exception",
          "reason" : "failed to compile groovy script"
        } ],
        "type" : "search_phase_execution_exception",
        "reason" : "all shards failed",
        "phase" : "query",
        "grouped" : true,
        "failed_shards" : [ {
          "shard" : 0,
          "index" : "sflow",
          "node" : "N0xpH1hbQG-V9DU-ANfVgw",
          "reason" : {
            "type" : "script_exception",
            "reason" : "Failed to compile inline script [[doc.SRC_IP.value, doc.DST_IP.value].join(-)] using lang [groovy]",
            "caused_by" : {
              "type" : "groovy_script_compilation_exception",
              "reason" : "failed to compile groovy script",
              "caused_by" : {
                "type" : "multiple_compilation_errors_exception",
                "reason" : "startup failed:
    c6d3ea304aad32f8cc4f0efa6b5540cf0ad6be1a: 1: unexpected token: ) @ line 1, column 44.
       alue, doc.DST_IP.value].join(-)
                                     ^
    
    1 error
    "
              }
            }
          }
        } ]
      },
      "status" : 500
    }
    
  • 相关阅读:
    HADOOP高可用机制
    HDFS详解
    HBase详解
    大数据计算
    Flume+Sqoop+Azkaban笔记
    Kafka知识总结
    Kafka集群安装部署、Kafka生产者、Kafka消费者
    Hive详解
    Spark面试相关
    HDFS常用操作命令
  • 原文地址:https://www.cnblogs.com/kramer/p/5140046.html
Copyright © 2011-2022 走看看