#!/usr/bin/env bash
# Load files from HDFS into a Hive partitioned table.
#
# version0.1: iterate over every individual file under the source path and
# load each one separately. Many loop iterations plus per-file loads make
# poor use of system I/O throughput.
while IFS= read -r file; do
  # Extract the day portion (8 chars) of the path.
  # NOTE(review): the substring offsets assume a fixed layout such as
  # /source_path/YYYYMMDD/HH/... — confirm against the real directory tree.
  day=${file:6:8}
  # Extract the hour portion (2 chars) of the path.
  hour=${file:14:2}
  beeline -e \
    "load data inpath '$file' into table table_name partition(p1=$day,p2=$hour);"
done < <(hadoop fs -ls /source_path/*/*/* | awk '{print $NF}')
# version0.2: iterate over each per-hour subdirectory and bulk-load all of
# its files in one statement. Fewer loop iterations and batched loads raise
# system I/O utilization.
# The awk filter keeps only paths whose final component is a two-digit hour
# directory (00-23), e.g. .../YYYYMMDD/HH.
while IFS= read -r dir; do
  # Extract the day portion (8 chars) of the path.
  # NOTE(review): offsets assume a fixed /source_path/YYYYMMDD/HH layout — confirm.
  day=${dir:6:8}
  # Extract the hour portion (2 chars) of the path.
  hour=${dir:14:2}
  # '/*' loads every file in the hour directory with a single statement.
  beeline -e \
    "load data inpath '$dir/*' into table table_name partition(p1=$day,p2=$hour);"
done < <(hadoop fs -ls -R /source_path/ | awk '$NF ~ /\/[0-2][0-9]$/ {print $NF}')
# version0.3: log in to Hive only once. Builds on version0.2 by accumulating
# every LOAD statement into one string and issuing a single beeline call,
# eliminating the per-iteration beeline startup/login cost.
str=""
while IFS= read -r dir; do
  # NOTE(review): offsets assume a fixed /source_path/YYYYMMDD/HH layout — confirm.
  day=${dir:6:8}
  hour=${dir:14:2}
  # Append this statement; ';' separates statements within one beeline session.
  # (Process substitution below keeps this loop in the current shell, so the
  # accumulated $str is still visible after the loop ends.)
  str="$str load data inpath '$dir/*' into table table_name partition(p1=$day,p2=$hour);"
done < <(hadoop fs -ls -R /source_path/ | awk '$NF ~ /\/[0-2][0-9]$/ {print $NF}')
beeline -e "$str"