zoukankan      html  css  js  c++  java
  • 网络爬虫爬取邮编对应的地址,并将其存入xml中作为数据库

    package com.bjsxt.ly;

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.UnsupportedEncodingException;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import org.dom4j.Document;
    import org.dom4j.DocumentHelper;
    import org.dom4j.Element;
    import org.dom4j.io.OutputFormat;
    import org.dom4j.io.XMLWriter;

    public class HelloSpider {
    public static void main(String[] args) throws Exception {
    //获取路径
    String path = System.getProperty("user.dir") + File.separator + "src" + File.separator + "postcode.xml";
    //邮政编码
    String postcode = "100088";
    //抓取网络信息
    CharSequence charSequence = webSpider("http://tool.cncn.com/youbian/" + postcode);
    //将抓取的信息通过正则表达式匹配,获取需要的内容
    List<String> list = regexpPostcode("([\u4e00-\u9fa5\w\(\)58-]+)(?=</li>)", 0, charSequence);
    //将爬取的数据存放至XML
    createXml(postcode, list, path);
    }

    /**
    * 创建XML文档
    * @param postcode
    * @param list
    * @param path
    * @throws IOException
    * @throws FileNotFoundException
    * @throws UnsupportedEncodingException
    */
    private static void createXml(String postcode, List<String> list, String path) throws UnsupportedEncodingException, FileNotFoundException, IOException {
    //创建根节点
    Element rootElement = DocumentHelper.createElement("postcodes");
    //开始创建子节点
    Element postcodeElement = DocumentHelper.createElement("postcode");
    postcodeElement.addAttribute("code", postcode);
    //遍历创建地址
    for (String address : list) {
    //创建节点
    Element addressElement = DocumentHelper.createElement("address");
    addressElement.setText(address);
    //添加节点
    postcodeElement.add(addressElement);
    }
    //拼接到根节点
    rootElement.add(postcodeElement);
    //开始创建文档对象模型
    Document document = DocumentHelper.createDocument(rootElement);
    //开始输出
    new XMLWriter(new FileOutputStream(path), OutputFormat.createPrettyPrint()).write(document);
    }

    /**
    * 正则表达式获取邮编
    * @param regex
    * @param flags
    * @param charSequence
    * @return
    */
    private static List<String> regexpPostcode(String regex, int flags, CharSequence charSequence) {
    //声明一个容器存放邮编地址
    List<String> list = new ArrayList<>();
    //获取模板
    Pattern pattern = Pattern.compile(regex, flags);
    //获取匹配器
    Matcher matcher = pattern.matcher(charSequence);
    //开始读取
    while (matcher.find()) {
    list.add(matcher.group());
    }
    //返回结果
    return list;
    }

    /**
    * 网络爬虫
    * @param spec
    * @return
    * @throws IOException
    */
    private static CharSequence webSpider(String spec) throws IOException {
    //获取URL地址
    URL url = new URL(spec);
    //获取连接
    URLConnection connection = url.openConnection();
    //伪装成浏览器
    connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0");
    //获取输入流
    InputStream inputStream = connection.getInputStream();
    //开始转换
    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "GBK"));
    //开始获取字符串
    StringBuffer buffer = new StringBuffer();
    String line = "";
    //开始遍历读取
    while ((line = reader.readLine()) != null) {
    //开始拼接字符串
    buffer.append(line);
    }
    //返回爬取的内容
    return buffer;
    }
    }

  • 相关阅读:
    解决curl请求字段中带中文出错的问题
    字体下载
    Qt在windows与Mac OS中获取执行程序版本号
    QProcess启动不了外部程序的问题
    (Qt5Core.dll)处有未经处理的异常: 请求了严重的程序退出。
    QLabel
    解决接收命令行参数的数据中丢失双引号的问题
    QFile保留指定位置的数据
    ubuntu下使用docker安装部署openstf
    远程连接安卓设备步骤
  • 原文地址:https://www.cnblogs.com/lgf428/p/5831472.html
Copyright © 2011-2022 走看看