zoukankan      html  css  js  c++  java
  • ELK基础原理

    搜索引擎 

        索引组件:  获取数据-->建立文档-->文档分析-->文档索引(倒排索引)
        搜索组件:  用户搜索接口-->建立查询(将用户键入的信息转换为可处理的查询对象)-->搜索查询-->展现结果

        索引组件:Lucene 核心组件

            索引(index):数据库(database)
            类型(type):表(table)
            文档(Document):行(row)
            映射(Mapping):

            Lucene只负责文档分析  不负责获取数据和建立文档  必须借助其它工具建立文档后才能发挥Lucene的作用

            文档分析最重要的就是切词 把整个文档切分成一个一个单词

        搜索组件:

            Solr                        基于单机运行

            ElasticSearch        基于分布式运行(弹性搜索引擎)   分散的运行到多个节点

       一个搜索引擎是由两个部分组成:

          1.search 搜索组件

              面向用户的接口   接入用户的请求  把用户的请求转换成适合搜索算法执行搜索的形式  把搜索结果返回给用户

          2.index   索引组件

              分析原始数据  改造原始数据 把原始数据结构变成适合搜索算法搜索的结构

          3.倒排索引的实现      

             1.首先把原始数据构建成文档
                1  winter is coming
                2  our is the big
                3  the pig is big

            2.把文档创建出倒排索引
              term     freq    documents
              winter   1          1
              big       2           2,3
               is       3          1,2,3
              our      1             2
          通过hash算法在倒排索引中把包含关键字的文档编号返回给客户端

        ELK的两种使用场景:

           1.整站的日志存储分析           2.全站搜索

       ELK和Hadoop的区别:

           Hadoop 只能实现离线计算
           文件系统        HDFS
           数据存储        HBase
           分布式计算    MapReduce

    Elasticsearch安装和配置    

      修改相关配置
        1.修改jvm初始化内存分配大小 /etc/elasticsearch/jvm.options
        2.主配置文件段 /etc/elasticsearch/elasticsearch.yml


        Cluster配置段   标识某个节点是否属于当前集群的成员
        Node配置段      集群中当前节点的唯一标识
        Paths配置段     设置日志和数据的存放路径
        Memory配置段        内存管理设置
        Network配置段        网络接口的设置
        Discovery配置段     成员关系判定的相关协议
        Gateway配置段       网关设置
        Various配置段        其他可变参数设置

      测试安装成功
       [root@wi]# curl -XGET http://192.168.74.128:9200
      {
         "name" : "192.168.74.128",
         "cluster_name" : "myels",
         "cluster_uuid" : "Qq0ms0ncQle85Wm27STTHg",
         "version" : {
            "number" : "5.6.10",
            "build_hash" : "b727a60",
           "build_date" : "2018-06-06T15:48:34.860Z",
           "build_snapshot" : false,
           "lucene_version" : "6.6.1"
          },
           "tagline" : "You Know, for Search"
      }
       创建索引
       [root@192 logs]# curl -XPUT http://192.168.74.128:9200/myindex
       {"acknowledged":true,"shards_acknowledged":true,"index":"myindex"}
       查看索引的分片信息
       [root@192 logs]# curl -XGET http://192.168.74.128:9200/_cat/shards
        myindex  4 p STARTED 0 162b 192.168.74.128 192.168.74.128
        myindex  4 r STARTED 0 162b 192.168.74.129 192.168.74.129
        myindex  1 r STARTED 0 162b 192.168.74.128 192.168.74.128
        myindex  1 p STARTED 0 162b 192.168.74.129 192.168.74.129
        myindex  3 r STARTED 0 162b 192.168.74.128 192.168.74.128
        myindex  3 p STARTED 0 162b 192.168.74.129 192.168.74.129
        myindex  2 p STARTED 0 162b 192.168.74.128 192.168.74.128
        myindex  2 r STARTED 0 162b 192.168.74.129 192.168.74.129
        myindex  0 p STARTED 0 162b 192.168.74.128 192.168.74.128
        myindex  0 r STARTED 0 162b 192.168.74.129 192.168.74.129

    Logstash安装和配置

         集中,转发并存储数据 高度插件化
         1. 数据输入插件(日志,redis)
         2. 数据过滤插件
         3. 数据输出插件
         logstash既可以做agent从本地收集数据信息 把数据文档化输出到elasticsearch  

         logstash也可以做server收集各个logstash agent收集的数据并对agent提交的数据统一做格式化,文档化再发送给elasticsearch

         logstash安装的默认目录在/usr/share/logstash中 此目录并没有在系统环境变量中 启动服务的时候需要指明绝对路径   

         ip地址数据库    maxmind geolite2

    [root@192 bin]# ./logstash -f /etc/logstash/conf.d/test1.conf
    jjjj
    {
          "@version" => "1",
              "host" => "192.168.1.4",
        "@timestamp" => 2018-08-19T08:32:19.449Z,
           "message" => "jjjj"
    }
    
    logstash配置文件格式
     input{ } filter { } output{ }
    logstash 内建pattern
    less /usr/share/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-4.1.2/patterns/
    
    input {
       file{
           start_position => "end"
           path => ["/var/log/httpd/access_log"]
       }
    }
    
    filter {
        grok {
          match => {"message" => "%{IP:client}"  }
        }
    }
    
    output {
      stdout {
        codec => rubydebug
      }
    }
    
    filter {
        # logstash内建很多插件模块
        grok {
          match => {"message" => "%{HTTPD_COMBINEDLOG}"  }
        }
    }
    配置文件基础框架
    input {
       file{
           start_position => "end"
           path => ["/var/log/httpd/access_log"]
       }
    }
    
    filter {
        grok {
          match => {"message" => "%{HTTPD_COMBINEDLOG}"  }
          remove_field => "message"
        }
       date {
          match => ["timestamp","dd/MMM/YYYY:H:m:s Z"]
          remove_field => "timestamp"
       }
       geoip {
             source   => "clientip"
             target   => "geoip"
             database => "/etc/logstash/geoip/GeoLite2-City.mmdb"
      }
    }
    
    output {
      elasticsearch {
          hosts => ["http://192.168.74.128:9200","http://192.168.74.129:9200"]
          index => "logstash-%{+YYYY.MM.dd}"
          document_type => "apache_logs"
      }
    }
    logstash收集文件
    input {
       beats {
           port => 5044
       }
    }
    
    filter {
      grok {
           match => { "message" => "%{HTTPD_COMBINEDLOG}" }
           remove_field => "message"
      }
     date {
         match => ["timestamp","dd/MMM/YYYY:H:m:s Z"]
         remove_field => "timestamp"
     }
     geoip {
        source  => "clientip"
        target  => "geoip"
        database => "/etc/logstash/geoip/GeoLite2-City.mmdb"
     }
    }
    
    output {
      elasticsearch {
          hosts => ["http://192.168.74.128:9200/","http://192.168.74.129:9200/"]
          index => "logstash-%{+YYYY.MM.dd}-33"
          document_type => "apache_logs"
      }
    }
    logstash收集filebeat数据
    input {
      redis {
         data_type => "list"
         db => 0
         host => "192.168.74.129"
         port => 6379
         key => "filebeat"
         password => "food"
      }
    }
    
    filter {
      grok {
           match => { "message" => "%{HTTPD_COMBINEDLOG}" }
           remove_field => "message"
      }
     date {
         match => ["timestamp","dd/MMM/YYYY:H:m:s Z"]
         remove_field => "timestamp"
     }
     geoip {
        source  => "clientip"
        target  => "geoip"
        database => "/etc/logstash/geoip/GeoLite2-City.mmdb"
     }
    }
    
    output {
      elasticsearch {
          hosts => ["http://192.168.74.128:9200/","http://192.168.74.129:9200/"]
          index => "logstash-%{+YYYY.MM.dd}"
          document_type => "apache_logs"
      }
    }
    logstash读取redis

     logstash server 配置文件支持if条件判断设置

    filter {
        if [path]  =~ "access" {
          grok {
             match => {"message" => "%{IP:client}"  }
          }
        }
        if [geo][city] == "bj" {
        
        }
    }
    if条件判断设置

    FileBeat安装和配置

    filebeat支持的所有插件示例文件存放在: /etc/filebeat/filebeat.full.yml 

    #-------------------------- Elasticsearch output ------------------------------
    #output.elasticsearch:
      # Array of hosts to connect to.
      #hosts: ["192.168.74.128:9200","192.168.74.129:9200"]
    
      # Optional protocol and basic auth credentials.
      #protocol: "https"
      #username: "elastic"
      #password: "changeme"
    
    
    #----------------------------- Logstash output --------------------------------
    #output.logstash:
      # The Logstash hosts
      #hosts: ["192.168.74.128:5044"]
    
      # Optional SSL. By default is off.
      # List of root certificates for HTTPS server verifications
      #ssl.certificate_authorities: ["/etc/pki/root/ca.pem"]
    
      # Certificate for SSL client authentication
      #ssl.certificate: "/etc/pki/client/cert.pem"
    
      # Client Certificate Key
      #ssl.key: "/etc/pki/client/cert.key"
    
    #----------- Redis output -------------------
    output.redis:
      # Boolean flag to enable or disable the output module.
      enabled: true
    
      # The list of Redis servers to connect to. If load balancing is enabled, the
      # events are distributed to the servers in the list. If one server becomes
      # unreachable, the events are distributed to the reachable servers only.
      hosts: ["192.168.74.129:6379"]
    
      # The Redis port to use if hosts does not contain a port number. The default
      # is 6379.
      port: 6379
    
      # The name of the Redis list or channel the events are published to. The
      # default is filebeat.
      key: filebeat
    
      # The password to authenticate with. The default is no authentication.
      password: food
    
      # The Redis database number where the events are published. The default is 0.
      db: 0
    
      # The Redis data type to use for publishing events. If the data type is list,
      # the Redis RPUSH command is used. If the data type is channel, the Redis
      # PUBLISH command is used. The default value is list.
      datatype: list
    
      # The number of workers to use for each host configured to publish events to
      # Redis. Use this setting along with the loadbalance option. For example, if
      # you have 2 hosts and 3 workers, in total 6 workers are started (3 for each
      # host).
      worker: 1
    
      # If set to true and multiple hosts or workers are configured, the output
      # plugin load balances published events onto all Redis hosts. If set to false,
      # the output plugin sends all events to only one host (determined at random)
      # and will switch to another host if the currently selected one becomes
      # unreachable. The default value is true.
      loadbalance: true
    
      # The Redis connection timeout in seconds. The default is 5 seconds.
      timeout: 5s
    
      # The number of times to retry publishing an event after a publishing failure.
      # After the specified number of retries, the events are typically dropped.
      # Some Beats, such as Filebeat, ignore the max_retries setting and retry until
      # all events are published. Set max_retries to a value less than 0 to retry
      # until all events are published. The default is 3.
      #max_retries: 3
    
      # The maximum number of events to bulk in a single Redis request or pipeline.
      # The default is 2048.
      #bulk_max_size: 2048
    
      # The URL of the SOCKS5 proxy to use when connecting to the Redis servers. The
      # value must be a URL with a scheme of socks5://.
      #proxy_url:
    
      # This option determines whether Redis hostnames are resolved locally when
      # using a proxy. The default value is false, which means that name resolution
      # occurs on the proxy server.
      #proxy_use_local_resolver: false
    
      # Enable SSL support. SSL is automatically enabled, if any SSL setting is set.
      #ssl.enabled: true
    
      # Configure SSL verification mode. If `none` is configured, all server hosts
      # and certificates will be accepted. In this mode, SSL based connections are
      # susceptible to man-in-the-middle attacks. Use only for testing. Default is
      # `full`.
      #ssl.verification_mode: full
    
      # List of supported/valid TLS versions. By default all TLS versions 1.0 up to
      # 1.2 are enabled.
      #ssl.supported_protocols: [TLSv1.0, TLSv1.1, TLSv1.2]
    
      # Optional SSL configuration options. SSL is off by default.
      # List of root certificates for HTTPS server verifications
      #ssl.certificate_authorities: ["/etc/pki/root/ca.pem"]
    
      # Certificate for SSL client authentication
      #ssl.certificate: "/etc/pki/client/cert.pem"
    
      # Client Certificate Key
      #ssl.key: "/etc/pki/client/cert.key"
    
      # Optional passphrase for decrypting the Certificate Key.
      #ssl.key_passphrase: ''
    
      # Configure cipher suites to be used for SSL connections
      #ssl.cipher_suites: []
    
      # Configure curve types for ECDHE based cipher suites
      #ssl.curve_types: []
    
      # Configure what types of renegotiation are supported. Valid options are
      # never, once, and freely. Default is never.
      #ssl.renegotiation: never
    Filebeat收集数据到redis

    Kibana安装配置

       kibana是一个独立的web服务器 可以单独安装在任何一台主机上
       kibana首次打开页面手动指定加载elasticsearch集群中的哪些索引(数据库) 》 index pattern
       @timestamp 获取记录的生成时间按照这列的值来进行排序

       查看进程是否启动             ps  aux

      查看端口是否正确监听      ss  -tnl

    logstash正则匹配实例

    \[[A-Z ]*\]\[(?<logtime>[0-9]{1,4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{3})\].*\[rid:(?<rid>[a-z0-9A-Z\s.]*),sid:(?<sid>[a-z0-9A-Z\s.]*),uid:(?<uid>[a-z0-9A-Z\s.]*),tid:(?<tid>[a-z0-9A-Z\s.]*),swjg:(?<swjg>[a-z0-9A-Z\s.]*)\] (?:timecost:(?<timecost>[0-9]*)){0,1},(?:url:(?<url>(.*?[^,]),)).*
    
    "url": "http://99.13.82.233:8080/api/common/basecode/datamapinvalues,",
    
    
    \[[A-Z ]*\]\[(?<logtime>[0-9]{1,4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{3})\].*\[rid:(?<rid>[a-z0-9A-Z\s.]*),sid:(?<sid>[a-z0-9A-Z\s.]*),uid:(?<uid>[a-z0-9A-Z\s.]*),tid:(?<tid>[a-z0-9A-Z\s.]*),swjg:(?<swjg>[a-z0-9A-Z\s.]*)\] (?:timecost:(?<timecost>[0-9]*)){0,1},(?:url:(?<url>(.*?[^,])),).*
    
    "url": "http://99.13.82.233:8080/api/common/basecode/datamapinvalues",
    
    
    \[[A-Z ]*\]\[(?<logtime>[0-9]{1,4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{3})\].*\[rid:(?<rid>[a-z0-9A-Z\s.]*),sid:(?<sid>[a-z0-9A-Z\s.]*),uid:(?<uid>[a-z0-9A-Z\s.]*),tid:(?<tid>[a-z0-9A-Z\s.]*),swjg:(?<swjg>[a-z0-9A-Z\s.]*)\] (?:timecost:(?<timecost>[0-9]*)){0,1},(((?:resturl):(?<resturl>(.*?[^,])),)|((?:url):(?<url>(.*?[^,])),)).*
    
    "url": "http://99.13.82.233:8080/api/common/basecode/datamapinvalues",
    "resturl": "http://99.13.82.233:8080/api/common/basecode/datamapinvalues",
    
    
    \[[A-Z ]*\]\[(?<logtime>[0-9]{1,4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{3})\].*\[rid:(?<rid>[a-z0-9A-Z\s.]*),sid:(?<sid>[a-z0-9A-Z\s.]*),uid:(?<uid>[a-z0-9A-Z\s.]*),tid:(?<tid>[a-z0-9A-Z\s.]*),swjg:(?<swjg>[a-z0-9A-Z\s.]*)\] (?:timecost:(?<timecost>[0-9]*)){0,1},(((?:resturl):(?<resturl>(.*?[^,])),)|((?:url):(?<url>(.*?[^,])),)|.*).*
    
    "url": "http://99.13.82.233:8080/api/common/basecode/datamapinvalues",
    "resturl": "http://99.13.82.233:8080/api/common/basecode/datamapinvalues",
    如果没有url或者resturl
    {
      "uid": "b1133",
      "swjg": "3232.2",
      "rid": "111",
      "logtime": "2018/09/19 11:39:00.098",
      "tid": "nh3211111.2",
      "timecost": "80",
      "sid": "222"
    }
    View Code
  • 相关阅读:
    PHP文件系统处理(二)
    PHP中的文件系统处理(一)
    PHP中常用正则表达式大全
    PHP中的正则表达式的使用
    SLF4J日志框架
    内部类
    计算机存储单位
    Maven 要点
    Maven 父类工程创建及引用
    Eclipse Maven Web项目创建
  • 原文地址:https://www.cnblogs.com/yxh168/p/9403923.html
Copyright © 2011-2022 走看看