微博excel数据清洗(Java版)

zoukankan html css js c++ java

微博excel数据清洗(Java版)
微博数据清洗(Java版)
原创 2013年12月10日 10:58:24
大数据公益大学提供的一份数据，义务处理一下，原始数据是Excel，含有html标签，如下：

要求清洗掉html标签和微博内容中的url地址。

主要分为两部分：

1.处理文本，清洗数据。

2.处理excel读写操作。

上代码：

ExcelUtil类，包含Excel2003-2007的读写操作,Excel使用Apache POI进行操作,需要jar包如下：

[java] view plain copy

package dat.datadeal;



import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.text.ParseException;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Date;

import java.util.List;

import java.util.Locale;

import java.util.logging.Level;

import java.util.logging.Logger;



import org.apache.poi.hssf.usermodel.HSSFCell;

import org.apache.poi.hssf.usermodel.HSSFCellStyle;

import org.apache.poi.hssf.usermodel.HSSFRow;

import org.apache.poi.hssf.usermodel.HSSFSheet;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;

import org.apache.poi.ss.usermodel.Cell;

import org.apache.poi.ss.usermodel.DateUtil;

import org.apache.poi.ss.usermodel.Row;

import org.apache.poi.ss.usermodel.Sheet;

import org.apache.poi.ss.usermodel.Workbook;

import org.apache.poi.xssf.usermodel.XSSFWorkbook;



/**

*

* @author daT dev.tao@gmail.com

*2003,2007版excel读写工具

*/

public class ExcelUtil{



    /**

     * Excel文件读取

     * @param filePath

     * @return String[]存的是行，List存的是列。

     * 一个excel一次全部读入内存(Excel超大需要另行处理)

     */

    public  List<String[]> readExcel(String filePath) {

        List<String[]> dataList = new ArrayList<String[]>();

        boolean isExcel2003 = true;

        if (isExcel2007(filePath)) {

            isExcel2003 = false;

        }

        File file = new File(filePath);

        InputStream is = null;

        try {

            is = new FileInputStream(file);

        } catch (FileNotFoundException ex) {

            Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);

        }

        Workbook wb = null;

        try {

            wb = isExcel2003 ? new HSSFWorkbook(is) : new XSSFWorkbook(is);

        } catch (IOException ex) {

            Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);

        }

        Sheet sheet = wb.getSheetAt(0);

        int totalRows = sheet.getPhysicalNumberOfRows();

        int totalCells = 0;

        if (totalRows >= 1 && sheet.getRow(0) != null) {

            totalCells = sheet.getRow(0).getPhysicalNumberOfCells();

        }

        for (int r = 0; r < totalRows; r++) {

            Row row = sheet.getRow(r);

            if (row == null) {

                continue;

            }

            String[] rowList = new String[totalCells];

            for (int c = 0; c < totalCells; c++) {

                Cell cell = row.getCell(c);

                String cellValue = "";

                if (cell == null) {

                    rowList[c] = (cellValue);

                    continue;

                }

                cellValue = ConvertCellStr(cell, cellValue);

                rowList[c] = (cellValue);

            }

            dataList.add(rowList);

        }

        return dataList;

    }





    private String ConvertCellStr(Cell cell, String cellStr) {

        switch (cell.getCellType()) {

            case Cell.CELL_TYPE_STRING:

                // 读取String

                cellStr = cell.getStringCellValue().toString();

                break;

            case Cell.CELL_TYPE_BOOLEAN:

                // 得到Boolean对象的方法

                cellStr = String.valueOf(cell.getBooleanCellValue());

                break;

            case Cell.CELL_TYPE_NUMERIC:

                // 先看是否是日期格式

                if (DateUtil.isCellDateFormatted(cell)) {

                    // 读取日期格式

                    cellStr = formatTime(cell.getDateCellValue().toString());

                } else {

                    // 读取数字

                    cellStr = String.valueOf(cell.getNumericCellValue());

                }

                break;

            case Cell.CELL_TYPE_FORMULA:

                // 读取公式

                cellStr = cell.getCellFormula().toString();

                break;

        }

        return cellStr;

    }







    private boolean isExcel2007(String fileName) {

            return fileName.matches("^.+\.(?i)(xlsx)$");

     }



    private String formatTime(String s) {

        SimpleDateFormat sf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy", Locale.ENGLISH);

        Date date = null;

        try {

            date = sf.parse(s);

        } catch (ParseException ex) {

            Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);

        }

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        String result = sdf.format(date);

        return result;

    }





    /**

     * Excel写操作,简单起见还是采用内存数据一次写入

     * @param filePath 输出文件路径名

     * @param dataList 输出文件内容，List<String>行  List列

     * @throws IOException

     */

    public  void writeExcel(String filePath,List<List<String>> dataList) throws IOException{

            HSSFWorkbook wb = new HSSFWorkbook();

            HSSFSheet sheet = wb.createSheet("sheet");// 添加sheet

            // 表格样式

            HSSFCellStyle style = wb.createCellStyle();

            style.setAlignment(HSSFCellStyle.ALIGN_CENTER);// 指定单元格居中对齐

            // // 边框

            // style.setBorderBottom(HSSFCellStyle.BORDER_MEDIUM);

            // style.setBorderTop(HSSFCellStyle.BORDER_MEDIUM);

            // style.setBorderLeft(HSSFCellStyle.BORDER_MEDIUM);

            // style.setBorderRight(HSSFCellStyle.BORDER_MEDIUM);

            // //设置字体

            // HSSFFont f = wb.createFont();

            // f.setFontHeightInPoints((short)10);

            // f.setBoldweight(HSSFFont.BOLDWEIGHT_NORMAL);

            // style.setFont(f);

            // //设置列宽

            // sheet.setColumnWidth((short)0, (short)9600);

            // sheet.setColumnWidth((short)1, (short)4000);

            // sheet.setColumnWidth((short)2, (short)8000);

            // sheet.setColumnWidth((short)3, (short)8000);



            // 在索引0的位置创建第一行



            for (int i = 0; i < dataList.size(); i++) {

                HSSFRow row = sheet.createRow(i);

                List<String> list = dataList.get(i);

                for (int j = 0; j < list.size(); j++) {

                    HSSFCell cell = row.createCell(j);

                    cell.setCellValue(list.get(j));

                    cell.setCellStyle(style);

                }

            }

            // 导出文件

            FileOutputStream fout = new FileOutputStream(filePath);

            wb.write(fout);

            fout.close();

    }



}

DataClean类，包含对html标签和信息中url的清洗。

[java] view plain copy

package dat.datadeal;



import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;



/**
 * Cleans text cells exported from Weibo: strips HTML markup and removes
 * embedded http/https URLs.
 *
 * @author daT dev.tao@gmail.com
 */
public class DataClean {

    /**
     * A complete script element including its body. Whitespace is matched
     * with {@code \s}; writing it as {@code /s} matches only the literal
     * characters '/' and 's', so script bodies would never be removed.
     */
    private static final Pattern SCRIPT_PATTERN = Pattern.compile(
            "<\\s*script[^>]*>[\\s\\S]*?<\\s*/\\s*script\\s*>",
            Pattern.CASE_INSENSITIVE);

    /** Any single HTML tag. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /**
     * An http or https URL. The scheme and "://" are literals rather than
     * character classes, so ordinary text such as "stop: go" is left intact.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "https?://[0-9A-Za-z:/_#?=.&-]*",
            Pattern.CASE_INSENSITIVE);

    /**
     * Removes HTML from a string: script elements are dropped together with
     * their content, and every remaining tag is deleted while its text is kept.
     *
     * @param inputString text that may contain HTML; may be {@code null}
     * @return the text without tags, or an empty string for {@code null} input
     */
    public static String delHtml(String inputString) {
        if (inputString == null) {
            return "";
        }
        // Scripts go first so their bodies do not survive the tag stripping.
        String withoutScripts = SCRIPT_PATTERN.matcher(inputString).replaceAll("");
        return TAG_PATTERN.matcher(withoutScripts).replaceAll("");
    }

    /**
     * Removes http/https URLs from a string.
     *
     * @param str text that may contain URLs; may be {@code null}
     * @return the text with every URL removed, or an empty string for
     *         {@code null} input
     */
    public static String dealWithUrl(String str) {
        if (str == null) {
            return "";
        }
        return URL_PATTERN.matcher(str).replaceAll("");
    }

    /**
     * Reads a workbook, cleans every cell and writes the result to a new workbook.
     *
     * @param args optional: {@code args[0]} is the input file and
     *             {@code args[1]} the output file; the original hard-coded
     *             paths are used for any argument that is missing
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputPath = "/home/dat/javatest/微博数据_.xlsx";
        String outputPath = "/home/dat/javatest/weibo.xlsx";
        if (args != null && args.length > 0) {
            inputPath = args[0];
        }
        if (args != null && args.length > 1) {
            outputPath = args[1];
        }

        ExcelUtil excelUtil = new ExcelUtil();
        List<List<String>> writeList = new ArrayList<List<String>>();
        List<String[]> readList = excelUtil.readExcel(inputPath);
        for (String[] lineArray : readList) {
            List<String> cleanedRow = new ArrayList<String>();
            for (String cellText : lineArray) {
                cleanedRow.add(DataClean.dealWithUrl(DataClean.delHtml(cellText)));
            }
            writeList.add(cleanedRow);
        }

        excelUtil.writeExcel(outputPath, writeList);
        System.out.println("job has finished...........");
    }

}

清洗后数据：
查看全文

相关阅读:
吴恩达机器学习笔记
 三个水杯
 架构之美读书笔记05
架构之美读书笔记04
架构之美读书笔记03
架构之美读书笔记02
架构之美读书笔记01
《需求工程——软件建模与分析》阅读笔记03
《需求工程——软件建模与分析》阅读笔记02
16下学期进度条2

原文地址：https://www.cnblogs.com/gaochsh/p/7803256.html

微博excel数据清洗(Java版)

微博数据清洗(Java版)