Flume reference example: https://blog.csdn.net/weixin_43326910/article/details/111713326
Raw web-log data description: https://blog.csdn.net/weixin_42229056/article/details/82775490
SQLyog: https://blog.csdn.net/Sunshine_liang1/article/details/84400820
1. Start the Hadoop HA cluster
zkServer.sh start        (on all three machines)
start-all.sh             (on the first machine only)
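If the cluster came up cleanly, jps on each node should list the expected daemons. A minimal sanity check (the exact daemon set depends on your HA layout, so treat this as a sketch):
jps
# on the first machine you would typically expect NameNode, ResourceManager,
# DFSZKFailoverController and QuorumPeerMain among the listed processes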
2. Start Flume to collect the log data
flume-ng agent --conf conf/ --conf-file /export/lastwork/flume/Taildir-web.conf --name a1 -Dflume.root.logger=INFO,console
hdfs dfs -rm -r /flume        # clear any output left over from a previous run
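Once the agent is running and has tailed some lines, you can confirm data is landing on HDFS. Note that the HDFS sink treats hdfs.path as a directory, so events end up in rolled files underneath it; a quick check might look like:
hdfs dfs -ls /flume/testLog.log
hdfs dfs -cat /flume/testLog.log/FlumeData.* | head -n 5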
3. Start the MySQL service
systemctl start mysqld.service
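Before moving on, it is worth confirming that the service is actually up (the credentials below are placeholders for your own):
systemctl status mysqld.service
mysql -u root -p -e "show databases;"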
4. Hive
hive
drop database weblog;
drop database weblog cascade;    -- use cascade when the database still contains tables
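To double-check that the old database is really gone before recreating it, you can list what the metastore still knows about:
show databases;    -- weblog should no longer appear in the output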
The Flume collection configuration is as follows:
a1.sources=r1
# sinks; there can be more than one, separated by commas
a1.sinks=k1
# channel; usually only one
a1.channels=c1
# source type
a1.sources.r1.type=TAILDIR
# path of the position (checkpoint) file
a1.sources.r1.positionFile=/var/log/flume/taildir_position.json
# file groups to monitor
a1.sources.r1.filegroups=f1 f2
a1.sources.r1.filegroups.f1=/var/log/test1/example.log
a1.sources.r1.filegroups.f2=/var/log/test2/.*log.*
# channel configuration
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
a1.channels.c1.keep-alive=10
a1.channels.c1.byteCapacity=0
# sink configuration
a1.sinks.k1.type=hdfs
# rollInterval / rollCount / rollSize: roll to a new HDFS file every N seconds,
# after N events, or once the file reaches N bytes; whichever condition is met
# first, a new file is created on HDFS and writing continues there
a1.sinks.k1.hdfs.rollInterval=60
a1.sinks.k1.hdfs.rollCount=30000
a1.sinks.k1.hdfs.rollSize=10240
a1.sinks.k1.hdfs.idleTimeout=60
a1.sinks.k1.hdfs.round=true
a1.sinks.k1.hdfs.roundValue=10
a1.sinks.k1.hdfs.roundUnit=minute
a1.sinks.k1.hdfs.useLocalTimeStamp=true
# the sink writes to HDFS instead of printing to the console
a1.sinks.k1.hdfs.path = hdfs://192.168.52.150:9000/flume/testLog.log

a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
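One caveat worth knowing: round, roundValue, roundUnit and useLocalTimeStamp only take effect when hdfs.path contains time escape sequences. If you wanted the sink to bucket output into 10-minute directories, a sketch of an alternative path (this layout is an assumption, not part of the setup above) would be:
# hypothetical time-bucketed layout; %Y, %m, %d, %H, %M are expanded by the sink
a1.sinks.k1.hdfs.path = hdfs://192.168.52.150:9000/flume/%Y-%m-%d/%H%M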
Copy the file back down from HDFS:
hadoop fs -get /flume/testLog.log /export/lastwork/flumedata/
The second path is a local path on your own VM.
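A quick look at the fetched data before loading it into Hive (if the sink rolled several files, testLog.log will come down as a local directory of FlumeData.* parts rather than a single file):
ls -l /export/lastwork/flumedata/
head -n 5 /export/lastwork/flumedata/testLog.log/FlumeData.*    # adjust if it came down as a single file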
hive
create database weblog;
create table ods_weblog_origin(
  valid string,
  remote_addr string,
  remote_user string,
  time_local string,
  request string,
  status string,
  body_bytes_sent string,
  http_referer string,
  http_user_agent string)
partitioned by (datestr string)
row format delimited
fields terminated by '