zoukankan      html  css  js  c++  java
  • java编写的一段简单的网络爬虫demo代码

    功能:

    从网站上下载附件,并从页面中提取页面文章内容

    关于NIO

    在大多数情况下,Java 应用程序并非真的受着 I/O 的束缚。操作系统并非不能快速传送
    数据,让 Java 有事可做;相反,是 JVM 自身在 I/O 方面效率欠佳。操作系统与 Java 基于流的 I/O
    模型有些不匹配。操作系统要移动的是大块数据(缓冲区),这往往是在硬件直接存储器存取
    (DMA)的协助下完成的。而 JVM 的 I/O 类喜欢操作小块数据——单个字节、几行文本。结果,
    操作系统送来整缓冲区的数据,java.io 的流数据类再花大量时间把它们拆成小块,往往拷贝一
    个小块就要往返于几层对象。操作系统喜欢整卡车地运来数据,java.io 类则喜欢一铲子一铲子
    地加工数据。有了 NIO,就可以轻松地把一卡车数据备份到您能直接使用的地方(ByteBuffer 对
    象)。

    以下代码使用了 Java NIO 以提高文件读写效率。Java NIO 与传统 IO 的差别,可以阅读《Java NIO》中文版了解。

    使用 XPath 抓取文章中特定 DOM 节点的内容。

    代码如下(已测试,可用,注意修改具体被爬行网站的接口):

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.google.common.collect.ImmutableMap;
    import com.google.common.io.ByteStreams;
    import lombok.extern.slf4j.Slf4j;
    import org.apache.commons.collections4.MapUtils;
    import org.apache.commons.lang3.RegExUtils;
    import org.apache.commons.lang3.StringUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpResponse;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.methods.HttpUriRequest;
    import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClientBuilder;
    import org.apache.http.message.BasicNameValuePair;
    import org.apache.http.ssl.SSLContextBuilder;
    import org.apache.http.util.EntityUtils;
    import org.w3c.dom.Node;
    import org.w3c.dom.NodeList;
    import org.xml.sax.InputSource;

    import javax.net.ssl.SSLContext;
    import javax.xml.xpath.XPath;
    import javax.xml.xpath.XPathConstants;
    import javax.xml.xpath.XPathExpressionException;
    import javax.xml.xpath.XPathFactory;
    import java.io.BufferedWriter;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.StringReader;
    import java.nio.ByteBuffer;
    import java.nio.channels.FileChannel;
    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.nio.file.StandardOpenOption;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.StringJoiner;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;
    
    
    @Slf4j
    public class Main {
        private static Pattern pattern = Pattern.compile("<div class="frame_subhead">[\s\S]*?</div>");
        private static Pattern pattern2 = Pattern.compile("<table class="form_table">[\s\S]*?</table>");
    
        String base = "http://172.16.3.122:9000";
    
        Set<String> attachments = new LinkedHashSet<>();
    
    
        public static void main(String[] args) throws IOException {
    
            Main bootstrap = new Main();
    
            String jsonArray = bootstrap.getResponseBody("http://172.16.3.122:9000/pdfs");
    
            log.info("爬虫程序已启动...");
    
            bootstrap.attachments.addAll(
                    bootstrap.getAttachments(jsonArray)
            );
    
            boolean succeed = bootstrap.login("admin", "123456");
    
            log.info("正在登陆网站获取cookie...");
    
            if (succeed) {
                List<String> sites = bootstrap.list();
                sites.forEach(site -> {
                    try {
                        bootstrap.crawl(site);
                    } catch (IOException | XPathExpressionException e) {
                        log.error("出错网站:{},{}",site,e);
                        System.err.println("出错网站:"+site);
                        e.printStackTrace();
                    }
                });
    
            }
        }
    
        List<String> getAttachments(String rawArray) throws IOException {
            ObjectMapper objectMapper = new ObjectMapper();
            String[] attachments = objectMapper.readValue(rawArray, String[].class);
            return Arrays.asList(attachments);
        }
    
        void crawl(String site) throws IOException, XPathExpressionException {
    
            Path path = Paths.get(String.format("d:/download/%s", site));
            char[] chars = path.getFileName().toFile().getName().toCharArray();
    
            if ((int) chars[0] != 8195) {
    
                if (!Files.exists(path)) {
                    Files.createDirectories(path);
                }
    
                downloadAttachment(site);
    
                List<Integer> array = Arrays.asList(3, 5, 6,7, 8);
                for (int i = 1; i <= 11; i++) {
                    ///昆山吉山会津塑料工业股份有限公司/html/1.html
                    String url = String.format("%s/%s/html/%d.html", base, site, i);
                    String html = getResponseBody(url);
                    if (StringUtils.isNotBlank(html)) {
                        String pattern = "tabContent_" + i;
                        int start = html.indexOf(pattern);
                        String title = extractSubTitle(start, html);
                        Path file = Paths.get(String.format("d:/download/%s/%s.txt", site, title));
                        if (array.contains(i)) {
                            saveFile(start, file, html);
                        }
                    }
                }
            }
        }
    
    
        void xQuery(String text,Path path) throws IOException, XPathExpressionException {
            String xml = text.substring(text.indexOf("<tbody>"));
    
            StringJoiner joiner = new StringJoiner("","<root>","</root>");
            InputSource inputXML = new InputSource( new StringReader( joiner.add(xml).toString() ) );
    
            XPath xPath = XPathFactory.newInstance().newXPath();
    
            NodeList tBodyNodes = (NodeList) xPath.evaluate("/root/tbody", inputXML, XPathConstants.NODESET);
    
            try (BufferedWriter writer = Files.newBufferedWriter(path, Charset.defaultCharset(), StandardOpenOption.CREATE)) {
                for (int i = 0; i < tBodyNodes.getLength(); i++) {
                    Node node = tBodyNodes.item(i);
    
                    NodeList trNodes = (NodeList) xPath.evaluate("tr", node, XPathConstants.NODESET);
                    for (int k = 0; k < trNodes.getLength(); k++) {
    
                        NodeList childList = (NodeList) xPath.evaluate("td", trNodes.item(k), XPathConstants.NODESET);
                        for (int j = 0; j < childList.getLength(); j++) {
                            Node child = childList.item(j);
                            String content = child.getTextContent();
    
                            writer.write(content);
                            if (j <childList.getLength() - 1) {
                                writer.write("	");
                            }
                        }
                        writer.write("
    ");
                    }
    
                    writer.write("
    ");
                }
            }
        }
    
        void saveFile(int start,Path path, String html) throws XPathExpressionException, IOException {
            Matcher matcher = pattern2.matcher(html);
            int step = 0;
            String tableText = "";
            while (step++ < 1 && matcher.find(start)) {
                tableText = RegExUtils.replacePattern(matcher.group(), "<table class="form_table">|</table>", "").trim();
            }
            xQuery(tableText,path);
        }
    
    
    
        void downloadAttachment(String site) {
            List<String> list = attachments.stream().filter(name -> name.startsWith(site)).collect(Collectors.toList());
    
            list.forEach(name -> {
    
                String filename = name.substring(name.lastIndexOf("/") + 1);
                log.info("正在下载 --{} -附件:{}", site, filename);
    
                String url = base + "/" + name;
                String dest = "d:/download/" + site + "/" + filename;
    
                Path file = Paths.get(dest).toAbsolutePath().normalize();
    
                if (!Files.exists(file)) {
    
                    Path path = file.getParent();
    
                    if (!Files.exists(path)) {
                        log.info("首次下载,正在创建目录:{}",path);
                        try {
                            Files.createDirectories(path);
                        } catch (IOException e) {
                            log.error("目录创建失败:{}",e);
                        }
                    }
    
                    log.info("正在保存采集来的附件,保存到:{}",file);
                    try (FileChannel fc = new FileOutputStream(dest).getChannel()) {
                        ByteBuffer buffer = getResponseAttachment(url);
                        fc.write(buffer);
                        log.info("文件{}已经成功保存",file);
                    } catch (IOException e) {
                        log.error("文件{}保存出错:{}",file,e);
                    }
                }
            });
    
        }
    
        List<String> list() throws IOException {
            String url = base + "/%E5%88%97%E8%A1%A8%E9%A1%B5%E9%9D%A2/%E6%B1%9F%E8%8B%8F%E7%9C%81%E9%AB%98%E6%96%B0%E6%8A%80%E6%9C%AF%E4%BC%81%E4%B8%9A%E8%BE%85%E5%8A%A9%E6%9D%90%E6%96%99%E6%8F%90%E4%BA%A4%E7%B3%BB%E7%BB%9F_files/Dir_Main.html";
            return Files.list(Paths.get("E:\pdf"))
                    .map(path -> path.getFileName().toFile().getName())
                    .filter(path -> (!path.startsWith(" ")) && !path.startsWith(" "))
                    .filter(dirName -> {
                        return !Arrays.asList("登录网页", "列表页面").contains(dirName);
                    }).collect(Collectors.toList());
        }
    
        boolean login(String username, String password) {
            String url = base + "/index.html";
            ImmutableMap<String, String> map = ImmutableMap.<String, String>builder()
                    .put("username", "admin")
                    .put("password", "123456")
                    .build();
            try {
                HttpResponse response = doPost(url, null, map);
                return true;
            } catch (IOException e) {
                log.error("登录出错:{}", e);
                ;
                return false;
            }
        }
    
    
        /**
         * 信任SSL证书
         *
         * @return
         */
        public CloseableHttpClient buildDefaultHttpClientTrustSSL() {
            SSLContext sslContext = null;
            try {
                sslContext = SSLContextBuilder.create().useProtocol(SSLConnectionSocketFactory.SSL).loadTrustMaterial((x, y) -> true).build();
            } catch (Exception e) {
                e.printStackTrace();
            }
            RequestConfig config = RequestConfig.custom()
                    .setSocketTimeout(30000)
                    .setConnectTimeout(30000)
                    .setConnectionRequestTimeout(30000)
                    .setContentCompressionEnabled(true)
                    .build();
            return HttpClientBuilder.create().setDefaultRequestConfig(config).setSSLContext(sslContext).setSSLHostnameVerifier((x, y) -> true).build();
        }
    
    
        /***
         * 从响应的报文中提取网站标题
         * @param responseBody
         * @return
         */
        public String extractSubTitle(int start, String responseBody) {
            Matcher matcher = pattern.matcher(responseBody);
            int i = 0;
            String subHead = "";
            while (i++ < 1 && matcher.find(start)) {
                subHead = StringUtils.replacePattern(matcher.group(), "<div class="frame_subhead">|</div>", "").trim();
            }
    
            int offset1 = subHead.indexOf("、");
            if (offset1 >= 0) {
                subHead = subHead.substring(offset1 + 1);
            }
    
            return subHead;
    
        }
    
    
        public String extract(String body, String pattern) {
            Pattern regex = Pattern.compile(pattern);
            return "";
        }
    
    
        HttpResponse doGet(String url, Map<String, String> headerRefs) throws IOException {
            //巡检时更改为信任证书
            CloseableHttpClient httpClient = buildDefaultHttpClientTrustSSL();
    
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)spider");
            httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
            httpGet.addHeader("Accept-Encoding", "gzip, deflate");
            httpGet.addHeader("Accept-Language", "zh-CN,zh;q=0.9");
    
            if (MapUtils.isNotEmpty(headerRefs)) {
                for (Map.Entry<String, String> entry : headerRefs.entrySet()) {
                    String name = entry.getKey();
                    String value = entry.getValue();
                    httpGet.setHeader(name, value);
                }
            }
    
            return httpClient.execute(httpGet);
        }
    
        HttpResponse doPost(String url, Map<String, String> headerRefs, Map<String, String> data) throws IOException {
            //巡检时更改为信任证书
            CloseableHttpClient httpClient = buildDefaultHttpClientTrustSSL();
    
            HttpPost httpPost = new HttpPost(url);
            httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) spider");
            httpPost.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
            httpPost.addHeader("Accept-Encoding", "gzip, deflate");
            httpPost.addHeader("Accept-Language", "zh-CN,zh;q=0.9");
    
            if (MapUtils.isNotEmpty(headerRefs)) {
                for (Map.Entry<String, String> entry : headerRefs.entrySet()) {
                    String name = entry.getKey();
                    String value = entry.getValue();
                    httpPost.setHeader(name, value);
                }
            }
    
            if (MapUtils.isNotEmpty(data)) {
    
                List<NameValuePair> nvps = new ArrayList<NameValuePair>();
    
                for (Map.Entry<String, String> entry : data.entrySet()) {
                    String name = entry.getKey();
                    String value = entry.getValue();
                    nvps.add(new BasicNameValuePair(name, value));
                }
                httpPost.setEntity(new UrlEncodedFormEntity(nvps));
            }
            return httpClient.execute(httpPost);
        }
    
    
        /***
         * 下载附件
         * @param url
         * @param headerRefs
         * @return
         * @throws IOException
         */
        ByteBuffer getResponseAttachment(String url, Map<String, String> headerRefs) throws IOException {
            HttpResponse response = doGet(url, headerRefs);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                try (InputStream responseStream = entity.getContent()) {
                    byte[] targetArray = ByteStreams.toByteArray(responseStream);
                    ByteBuffer bufferByte = ByteBuffer.wrap(targetArray);
                    return bufferByte;
                }
            }
            return ByteBuffer.wrap(new byte[0]);
        }
    
        ByteBuffer getResponseAttachment(String url) throws IOException {
            return getResponseAttachment(url, null);
        }
    
        /***
         * 下载html响应报文主题(html代码)
         * @param url
         * @param headerRefs
         * @param charset
         * @return
         * @throws IOException
         */
        String getResponseBody(String url, Map<String, String> headerRefs, Charset charset) throws IOException {
            HttpResponse response = doGet(url, headerRefs);
    
            int status = response.getStatusLine().getStatusCode();
            if (status != 200) {
                return "";
            }
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                return EntityUtils.toString(entity, charset);
            }
            return "";
        }
    
        String getResponseBody(String url, Charset charset) throws IOException {
            return getResponseBody(url, null, charset);
        }
    
        String getResponseBody(String url) throws IOException {
            return getResponseBody(url, null, StandardCharsets.UTF_8);
        }
    
    }
  • 相关阅读:
    Fibonacci数列2
    足球队
    网页导航
    Catenyms
    某种密码
    大逃亡
    球的序列
    圆内三角形统计
    最小平方数

  • 原文地址:https://www.cnblogs.com/passedbylove/p/11683452.html
Copyright © 2011-2022 走看看