微博数据清洗(Java版)
大数据公益大学提供的一份数据,义务处理一下,原始数据是Excel,含有html标签,如下:
要求清洗掉html标签,和微博内容中的url地址。
主要分为两部分:
1.处理文本,清洗数据。
2.处理excel读写操作。
上代码:
ExcelUtil类,包含Excel2003-2007的读写操作,Excel使用Apache POI进行操作,需要jar包如下:
- package dat.datadeal;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.text.ParseException;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Locale;
- import java.util.logging.Level;
- import java.util.logging.Logger;
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hssf.usermodel.HSSFCellStyle;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.ss.usermodel.Cell;
- import org.apache.poi.ss.usermodel.DateUtil;
- import org.apache.poi.ss.usermodel.Row;
- import org.apache.poi.ss.usermodel.Sheet;
- import org.apache.poi.ss.usermodel.Workbook;
- import org.apache.poi.xssf.usermodel.XSSFWorkbook;
- /**
- *
- * @author daT dev.tao@gmail.com
- *2003,2007版excel读写工具
- */
- public class ExcelUtil{
- /**
- * Excel文件读取
- * @param filePath
- * @return String[]存的是行,List存的是列。
- * 一个excel一次全部读入内存(Excel超大需要另行处理)
- */
- public List<String[]> readExcel(String filePath) {
- List<String[]> dataList = new ArrayList<String[]>();
- boolean isExcel2003 = true;
- if (isExcel2007(filePath)) {
- isExcel2003 = false;
- }
- File file = new File(filePath);
- InputStream is = null;
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- Workbook wb = null;
- try {
- wb = isExcel2003 ? new HSSFWorkbook(is) : new XSSFWorkbook(is);
- } catch (IOException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- Sheet sheet = wb.getSheetAt(0);
- int totalRows = sheet.getPhysicalNumberOfRows();
- int totalCells = 0;
- if (totalRows >= 1 && sheet.getRow(0) != null) {
- totalCells = sheet.getRow(0).getPhysicalNumberOfCells();
- }
- for (int r = 0; r < totalRows; r++) {
- Row row = sheet.getRow(r);
- if (row == null) {
- continue;
- }
- String[] rowList = new String[totalCells];
- for (int c = 0; c < totalCells; c++) {
- Cell cell = row.getCell(c);
- String cellValue = "";
- if (cell == null) {
- rowList[c] = (cellValue);
- continue;
- }
- cellValue = ConvertCellStr(cell, cellValue);
- rowList[c] = (cellValue);
- }
- dataList.add(rowList);
- }
- return dataList;
- }
- private String ConvertCellStr(Cell cell, String cellStr) {
- switch (cell.getCellType()) {
- case Cell.CELL_TYPE_STRING:
- // 读取String
- cellStr = cell.getStringCellValue().toString();
- break;
- case Cell.CELL_TYPE_BOOLEAN:
- // 得到Boolean对象的方法
- cellStr = String.valueOf(cell.getBooleanCellValue());
- break;
- case Cell.CELL_TYPE_NUMERIC:
- // 先看是否是日期格式
- if (DateUtil.isCellDateFormatted(cell)) {
- // 读取日期格式
- cellStr = formatTime(cell.getDateCellValue().toString());
- } else {
- // 读取数字
- cellStr = String.valueOf(cell.getNumericCellValue());
- }
- break;
- case Cell.CELL_TYPE_FORMULA:
- // 读取公式
- cellStr = cell.getCellFormula().toString();
- break;
- }
- return cellStr;
- }
- private boolean isExcel2007(String fileName) {
- return fileName.matches("^.+\.(?i)(xlsx)$");
- }
- private String formatTime(String s) {
- SimpleDateFormat sf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy", Locale.ENGLISH);
- Date date = null;
- try {
- date = sf.parse(s);
- } catch (ParseException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- String result = sdf.format(date);
- return result;
- }
- /**
- * Excel写操作,简单起见还是采用内存数据一次写入
- * @param filePath 输出文件路径名
- * @param dataList 输出文件内容,List<String>行 List列
- * @throws IOException
- */
- public void writeExcel(String filePath,List<List<String>> dataList) throws IOException{
- HSSFWorkbook wb = new HSSFWorkbook();
- HSSFSheet sheet = wb.createSheet("sheet");// 添加sheet
- // 表格样式
- HSSFCellStyle style = wb.createCellStyle();
- style.setAlignment(HSSFCellStyle.ALIGN_CENTER);// 指定单元格居中对齐
- // // 边框
- // style.setBorderBottom(HSSFCellStyle.BORDER_MEDIUM);
- // style.setBorderTop(HSSFCellStyle.BORDER_MEDIUM);
- // style.setBorderLeft(HSSFCellStyle.BORDER_MEDIUM);
- // style.setBorderRight(HSSFCellStyle.BORDER_MEDIUM);
- // //设置字体
- // HSSFFont f = wb.createFont();
- // f.setFontHeightInPoints((short)10);
- // f.setBoldweight(HSSFFont.BOLDWEIGHT_NORMAL);
- // style.setFont(f);
- // //设置列宽
- // sheet.setColumnWidth((short)0, (short)9600);
- // sheet.setColumnWidth((short)1, (short)4000);
- // sheet.setColumnWidth((short)2, (short)8000);
- // sheet.setColumnWidth((short)3, (short)8000);
- // 在索引0的位置创建第一行
- for (int i = 0; i < dataList.size(); i++) {
- HSSFRow row = sheet.createRow(i);
- List<String> list = dataList.get(i);
- for (int j = 0; j < list.size(); j++) {
- HSSFCell cell = row.createCell(j);
- cell.setCellValue(list.get(j));
- cell.setCellStyle(style);
- }
- }
- // 导出文件
- FileOutputStream fout = new FileOutputStream(filePath);
- wb.write(fout);
- fout.close();
- }
- }
DataClean类,包含对html标签,信息中url的的清洗。
- package dat.datadeal;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- /**
- *
- * @author daT dev.tao@gmail.com
- *
- */
- public class DataClean {
- /**
- * 清洗html标签
- * @param inputString
- * @return
- */
- public static String delHtml(String inputString) {
- String htmlStr = inputString; // 含html标签的字符串
- String textStr = "";
- java.util.regex.Pattern p_script;
- java.util.regex.Matcher m_script;
- java.util.regex.Pattern p_html;
- java.util.regex.Matcher m_html;
- try {
- String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式
- String regEx_script = "<[/s]*?script[^>]*?>[/s/S]*?<[/s]*?//[/s]*?script[/s]*?>"; // 定义script的正则表达式{或<script[^>]*?>[/s/S]*?<//script>
- p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
- m_script = p_script.matcher(htmlStr);
- htmlStr = m_script.replaceAll(""); // 过滤script标签
- p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
- m_html = p_html.matcher(htmlStr);
- htmlStr = m_html.replaceAll(""); // 过滤html标签
- textStr = htmlStr;
- } catch (Exception e) {
- System.err.println("Html2Text: " + e.getMessage());
- }
- return textStr;// 返回文本字符串
- }
- /**
- * 处理掉信息中的url地址
- */
- public static String dealWithUrl(String str){
- String regEx = "[http|https]+[://]+[0-9A-Za-z:/[-]_#[?][=][.][&]]*";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(str);
- return m.replaceAll("");
- }
- public static void main(String[] args) throws IOException{
- ExcelUtil excelUtil = new ExcelUtil();
- List<List<String>> writeList = new ArrayList<List<String>>();
- List<String[]> readList =excelUtil.readExcel("/home/dat/javatest/微博数据_.xlsx");
- for(String[] lineArray:readList){
- List<String> strList = new ArrayList<String>();
- for(String str:lineArray){
- String strTmp = DataClean.dealWithUrl(DataClean.delHtml(str));
- strList.add(strTmp);
- //System.out.println(strTmp);
- }
- writeList.add(strList);
- }
- excelUtil.writeExcel("/home/dat/javatest/weibo.xlsx",writeList);
- System.out.println("job has finished...........");
- }
- }
清洗后数据: