功能:
从网站上下载附件,并从页面中提取页面文章内容
关于NIO
在大多数情况下,Java 应用程序并非真的受着 I/O 的束缚。操作系统并非不能快速传送
数据,让 Java 有事可做;相反,是 JVM 自身在 I/O 方面效率欠佳。操作系统与 Java 基于流的 I/O
模型有些不匹配。操作系统要移动的是大块数据(缓冲区),这往往是在硬件直接存储器存取
(DMA)的协助下完成的。而 JVM 的 I/O 类喜欢操作小块数据——单个字节、几行文本。结果,
操作系统送来整缓冲区的数据,java.io 的流数据类再花大量时间把它们拆成小块,往往拷贝一
个小块就要往返于几层对象。操作系统喜欢整卡车地运来数据,java.io 类则喜欢一铲子一铲子
地加工数据。有了 NIO,就可以轻松地把一卡车数据备份到您能直接使用的地方(ByteBuffer 对
象)。
以下代码使用了 Java NIO 以提高文件读写效率。关于 Java NIO 与传统 IO 的差别,可以阅读《Java NIO》中文版了解。
使用 XPath 抓取文章中特定 DOM 节点的内容。
代码如下(已测试,可用,注意根据具体被爬取网站修改接口地址):
import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableMap; import com.google.common.io.ByteStreams; import lombok.extern.slf4j.Slf4j; import org.apache.commons.collections4.MapUtils; import org.apache.commons.lang3.RegExUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.message.BasicNameValuePair; import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.util.EntityUtils; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import javax.net.ssl.SSLContext; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringJoiner; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @Slf4j public class Main { private static Pattern pattern = Pattern.compile("<div 
class="frame_subhead">[\s\S]*?</div>"); private static Pattern pattern2 = Pattern.compile("<table class="form_table">[\s\S]*?</table>"); String base = "http://172.16.3.122:9000"; Set<String> attachments = new LinkedHashSet<>(); public static void main(String[] args) throws IOException { Main bootstrap = new Main(); String jsonArray = bootstrap.getResponseBody("http://172.16.3.122:9000/pdfs"); log.info("爬虫程序已启动..."); bootstrap.attachments.addAll( bootstrap.getAttachments(jsonArray) ); boolean succeed = bootstrap.login("admin", "123456"); log.info("正在登陆网站获取cookie..."); if (succeed) { List<String> sites = bootstrap.list(); sites.forEach(site -> { try { bootstrap.crawl(site); } catch (IOException | XPathExpressionException e) { log.error("出错网站:{},{}",site,e); System.err.println("出错网站:"+site); e.printStackTrace(); } }); } } List<String> getAttachments(String rawArray) throws IOException { ObjectMapper objectMapper = new ObjectMapper(); String[] attachments = objectMapper.readValue(rawArray, String[].class); return Arrays.asList(attachments); } void crawl(String site) throws IOException, XPathExpressionException { Path path = Paths.get(String.format("d:/download/%s", site)); char[] chars = path.getFileName().toFile().getName().toCharArray(); if ((int) chars[0] != 8195) { if (!Files.exists(path)) { Files.createDirectories(path); } downloadAttachment(site); List<Integer> array = Arrays.asList(3, 5, 6,7, 8); for (int i = 1; i <= 11; i++) { ///昆山吉山会津塑料工业股份有限公司/html/1.html String url = String.format("%s/%s/html/%d.html", base, site, i); String html = getResponseBody(url); if (StringUtils.isNotBlank(html)) { String pattern = "tabContent_" + i; int start = html.indexOf(pattern); String title = extractSubTitle(start, html); Path file = Paths.get(String.format("d:/download/%s/%s.txt", site, title)); if (array.contains(i)) { saveFile(start, file, html); } } } } } void xQuery(String text,Path path) throws IOException, XPathExpressionException { String xml = 
text.substring(text.indexOf("<tbody>")); StringJoiner joiner = new StringJoiner("","<root>","</root>"); InputSource inputXML = new InputSource( new StringReader( joiner.add(xml).toString() ) ); XPath xPath = XPathFactory.newInstance().newXPath(); NodeList tBodyNodes = (NodeList) xPath.evaluate("/root/tbody", inputXML, XPathConstants.NODESET); try (BufferedWriter writer = Files.newBufferedWriter(path, Charset.defaultCharset(), StandardOpenOption.CREATE)) { for (int i = 0; i < tBodyNodes.getLength(); i++) { Node node = tBodyNodes.item(i); NodeList trNodes = (NodeList) xPath.evaluate("tr", node, XPathConstants.NODESET); for (int k = 0; k < trNodes.getLength(); k++) { NodeList childList = (NodeList) xPath.evaluate("td", trNodes.item(k), XPathConstants.NODESET); for (int j = 0; j < childList.getLength(); j++) { Node child = childList.item(j); String content = child.getTextContent(); writer.write(content); if (j <childList.getLength() - 1) { writer.write(" "); } } writer.write(" "); } writer.write(" "); } } } void saveFile(int start,Path path, String html) throws XPathExpressionException, IOException { Matcher matcher = pattern2.matcher(html); int step = 0; String tableText = ""; while (step++ < 1 && matcher.find(start)) { tableText = RegExUtils.replacePattern(matcher.group(), "<table class="form_table">|</table>", "").trim(); } xQuery(tableText,path); } void downloadAttachment(String site) { List<String> list = attachments.stream().filter(name -> name.startsWith(site)).collect(Collectors.toList()); list.forEach(name -> { String filename = name.substring(name.lastIndexOf("/") + 1); log.info("正在下载 --{} -附件:{}", site, filename); String url = base + "/" + name; String dest = "d:/download/" + site + "/" + filename; Path file = Paths.get(dest).toAbsolutePath().normalize(); if (!Files.exists(file)) { Path path = file.getParent(); if (!Files.exists(path)) { log.info("首次下载,正在创建目录:{}",path); try { Files.createDirectories(path); } catch (IOException e) { log.error("目录创建失败:{}",e); 
} } log.info("正在保存采集来的附件,保存到:{}",file); try (FileChannel fc = new FileOutputStream(dest).getChannel()) { ByteBuffer buffer = getResponseAttachment(url); fc.write(buffer); log.info("文件{}已经成功保存",file); } catch (IOException e) { log.error("文件{}保存出错:{}",file,e); } } }); } List<String> list() throws IOException { String url = base + "/%E5%88%97%E8%A1%A8%E9%A1%B5%E9%9D%A2/%E6%B1%9F%E8%8B%8F%E7%9C%81%E9%AB%98%E6%96%B0%E6%8A%80%E6%9C%AF%E4%BC%81%E4%B8%9A%E8%BE%85%E5%8A%A9%E6%9D%90%E6%96%99%E6%8F%90%E4%BA%A4%E7%B3%BB%E7%BB%9F_files/Dir_Main.html"; return Files.list(Paths.get("E:\pdf")) .map(path -> path.getFileName().toFile().getName()) .filter(path -> (!path.startsWith(" ")) && !path.startsWith(" ")) .filter(dirName -> { return !Arrays.asList("登录网页", "列表页面").contains(dirName); }).collect(Collectors.toList()); } boolean login(String username, String password) { String url = base + "/index.html"; ImmutableMap<String, String> map = ImmutableMap.<String, String>builder() .put("username", "admin") .put("password", "123456") .build(); try { HttpResponse response = doPost(url, null, map); return true; } catch (IOException e) { log.error("登录出错:{}", e); ; return false; } } /** * 信任SSL证书 * * @return */ public CloseableHttpClient buildDefaultHttpClientTrustSSL() { SSLContext sslContext = null; try { sslContext = SSLContextBuilder.create().useProtocol(SSLConnectionSocketFactory.SSL).loadTrustMaterial((x, y) -> true).build(); } catch (Exception e) { e.printStackTrace(); } RequestConfig config = RequestConfig.custom() .setSocketTimeout(30000) .setConnectTimeout(30000) .setConnectionRequestTimeout(30000) .setContentCompressionEnabled(true) .build(); return HttpClientBuilder.create().setDefaultRequestConfig(config).setSSLContext(sslContext).setSSLHostnameVerifier((x, y) -> true).build(); } /*** * 从响应的报文中提取网站标题 * @param responseBody * @return */ public String extractSubTitle(int start, String responseBody) { Matcher matcher = pattern.matcher(responseBody); int i = 0; String subHead = ""; 
while (i++ < 1 && matcher.find(start)) { subHead = StringUtils.replacePattern(matcher.group(), "<div class="frame_subhead">|</div>", "").trim(); } int offset1 = subHead.indexOf("、"); if (offset1 >= 0) { subHead = subHead.substring(offset1 + 1); } return subHead; } public String extract(String body, String pattern) { Pattern regex = Pattern.compile(pattern); return ""; } HttpResponse doGet(String url, Map<String, String> headerRefs) throws IOException { //巡检时更改为信任证书 CloseableHttpClient httpClient = buildDefaultHttpClientTrustSSL(); HttpGet httpGet = new HttpGet(url); httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)spider"); httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); httpGet.addHeader("Accept-Encoding", "gzip, deflate"); httpGet.addHeader("Accept-Language", "zh-CN,zh;q=0.9"); if (MapUtils.isNotEmpty(headerRefs)) { for (Map.Entry<String, String> entry : headerRefs.entrySet()) { String name = entry.getKey(); String value = entry.getValue(); httpGet.setHeader(name, value); } } return httpClient.execute(httpGet); } HttpResponse doPost(String url, Map<String, String> headerRefs, Map<String, String> data) throws IOException { //巡检时更改为信任证书 CloseableHttpClient httpClient = buildDefaultHttpClientTrustSSL(); HttpPost httpPost = new HttpPost(url); httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) spider"); httpPost.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); httpPost.addHeader("Accept-Encoding", "gzip, deflate"); httpPost.addHeader("Accept-Language", "zh-CN,zh;q=0.9"); if (MapUtils.isNotEmpty(headerRefs)) { for (Map.Entry<String, String> entry : headerRefs.entrySet()) { String name = entry.getKey(); String value = entry.getValue(); httpPost.setHeader(name, value); } } if (MapUtils.isNotEmpty(data)) { List<NameValuePair> nvps = new ArrayList<NameValuePair>(); for (Map.Entry<String, String> 
entry : data.entrySet()) { String name = entry.getKey(); String value = entry.getValue(); nvps.add(new BasicNameValuePair(name, value)); } httpPost.setEntity(new UrlEncodedFormEntity(nvps)); } return httpClient.execute(httpPost); } /*** * 下载附件 * @param url * @param headerRefs * @return * @throws IOException */ ByteBuffer getResponseAttachment(String url, Map<String, String> headerRefs) throws IOException { HttpResponse response = doGet(url, headerRefs); HttpEntity entity = response.getEntity(); if (entity != null) { try (InputStream responseStream = entity.getContent()) { byte[] targetArray = ByteStreams.toByteArray(responseStream); ByteBuffer bufferByte = ByteBuffer.wrap(targetArray); return bufferByte; } } return ByteBuffer.wrap(new byte[0]); } ByteBuffer getResponseAttachment(String url) throws IOException { return getResponseAttachment(url, null); } /*** * 下载html响应报文主题(html代码) * @param url * @param headerRefs * @param charset * @return * @throws IOException */ String getResponseBody(String url, Map<String, String> headerRefs, Charset charset) throws IOException { HttpResponse response = doGet(url, headerRefs); int status = response.getStatusLine().getStatusCode(); if (status != 200) { return ""; } HttpEntity entity = response.getEntity(); if (entity != null) { return EntityUtils.toString(entity, charset); } return ""; } String getResponseBody(String url, Charset charset) throws IOException { return getResponseBody(url, null, charset); } String getResponseBody(String url) throws IOException { return getResponseBody(url, null, StandardCharsets.UTF_8); } }