今天做企业异常发票分析。增值税发票明细那个数据文件特别大,有一个多 G,一般方式根本打不开,放在代码编辑器里才勉强打开,但是操作特别卡。
做这个异常分析时没有什么思路,就先按要求的步骤进行:先做数据清洗和数据导入。
先创建三个表,分别将三份数据导入。
增值税发票表
-- VAT invoice table (zzsfp).
-- Every column is declared STRING so the comma-delimited text file can be
-- loaded as-is; no type conversion is attempted at load time.
CREATE TABLE zzsfp (
    fp_nid STRING,
    xf_id  STRING,
    gf_id  STRING,
    je     STRING,
    se     STRING,
    jshj   STRING,
    kpyf   STRING,
    kprq   STRING,
    zfbz   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- Append the local export file to the table.
LOAD DATA LOCAL INPATH '/opt/software/apache-hive-2.3.9-bin/zzsfp - 1.txt'
INTO TABLE zzsfp;
纳税人信息表
-- Taxpayer information table (nsrxx).
-- Every column is declared STRING so the comma-delimited text file can be
-- loaded as-is; no type conversion is attempted at load time.
CREATE TABLE nsrxx (
    hydm      STRING,
    nsr_id    STRING,
    djzclx_dm STRING,
    kydjrq    STRING,
    xgrq      STRING,
    label     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- Append the local export file to the table.
LOAD DATA LOCAL INPATH '/opt/software/apache-hive-2.3.9-bin/nsrxx1.txt'
INTO TABLE nsrxx;
货物明细表
-- Goods detail table (hwmx): raw staging copy of the invoice line items.
-- The source file is loaded without pre-cleaning, so each row is still
-- wrapped in parentheses: fp_nid keeps a leading '(' and spbm keeps a
-- trailing ')'. Strip them downstream before using these two columns.
CREATE TABLE hwmx (
    fp_nid   STRING,
    date_key STRING,
    hwmc     STRING,
    ggxh     STRING,
    dw       STRING,
    sl       DOUBLE,
    dj       DOUBLE,
    je       DOUBLE,
    se       DOUBLE,
    spbm     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
-- The ';' above is required: without it Hive parses the CREATE TABLE and the
-- LOAD DATA below as a single statement and rejects both.

-- Append the local export file to the table.
LOAD DATA LOCAL INPATH '/opt/software/apache-hive-2.3.9-bin/zzsfp_hwmx1.txt'
INTO TABLE hwmx;
因为货物明细这个文件特别大,用文本编辑器的字符替换处理不了。前两个表每一行数据首尾的括号可以用文本替换去除;但这个文件太大,文本替换直接就卡死了,我把文件分成三份来替换照样卡死,所以需要采取别的手段。
第一种编写java程序对源表进行清洗
import java.io.*;
/**
 * Removes the parentheses that wrap every row of the exported goods-detail
 * file, streaming line by line so that multi-gigabyte inputs never have to
 * fit in memory.
 *
 * <p>Usage: {@code java Replace [inputPath [outputPath]]}. When an argument is
 * omitted the original hard-coded location is used.
 */
public class Replace {
    /** Default location of the raw export whose rows look like {@code (v1,v2,...)}. */
    private static final String DEFAULT_INPUT = "D:\\zzsfp_hwmx";

    /** Default location of the cleaned file. */
    private static final String DEFAULT_OUTPUT = "D:\\demo.txt";

    /** Charset used for both reading and writing so non-ASCII text survives the round trip. */
    private static final String ENCODING = "UTF-8";

    /**
     * Copies the input file to the output file, stripping the wrapping
     * parentheses from each row.
     *
     * @param args optional {@code [inputPath, outputPath]} overrides
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // try-with-resources flushes and closes both streams even when a read
        // or write fails part-way, so the output is never silently truncated.
        try (BufferedReader reader = new BufferedReader(
                 new InputStreamReader(new FileInputStream(inputPath), ENCODING));
             BufferedWriter writer = new BufferedWriter(
                 new OutputStreamWriter(new FileOutputStream(outputPath), ENCODING))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(stripWrappingParens(line));
                // Always emit a bare '\n' (not the platform separator) so that no
                // stray '\r' ends up in the last column when the file is loaded
                // on a Unix host.
                writer.write('\n');
            }
        }
    }

    /**
     * Returns {@code line} without its first and last character when it is
     * wrapped as {@code (...)}; any other line (empty, too short, or not
     * wrapped) is returned unchanged instead of being corrupted.
     */
    private static String stripWrappingParens(String line) {
        int last = line.length() - 1;
        if (last >= 1 && line.charAt(0) == '(' && line.charAt(last) == ')') {
            return line.substring(1, last);
        }
        return line;
    }
}
写代码清洗反倒很快
第二种
-- Cleaned copy of hwmx: same layout, but with the row-wrapping parentheses
-- removed from the first and last columns.
CREATE TABLE hwmx1 (
    fp_nid   STRING,
    date_key STRING,
    hwmc     STRING,
    ggxh     STRING,
    dw       STRING,
    sl       DOUBLE,
    dj       DOUBLE,
    je       DOUBLE,
    se       DOUBLE,
    spbm     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- translate() with an empty replacement string deletes every occurrence of
-- the listed character: '(' from fp_nid and ')' from spbm.
INSERT OVERWRITE TABLE hwmx1
SELECT
    translate(src.fp_nid, '(', '') AS fp_nid,
    src.date_key,
    src.hwmc,
    src.ggxh,
    src.dw,
    src.sl,
    src.dj,
    src.je,
    src.se,
    translate(src.spbm, ')', '') AS spbm
FROM hwmx src;
Hive 中可以使用 translate 函数来改变对应字段的字符串内容。
在 select 时把括号替换为空字符串,再写入新表,即可将括号剔除。