最近在项目中遇到了需要处理爬虫抓取下来的文章内容编码的需求,经过研究之后,整理出如下方法:
package com.vito.jeeboot.modules.cms.utils; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.util.StringUtils; import org.springframework.web.util.HtmlUtils; import com.alibaba.fastjson.JSON; import com.vito.jeeboot.modules.cms.entity.Article; import com.vito.jeeboot.modules.cms.entity.ArticleData; import com.vito.jeeboot.modules.cms.entity.HttpResultEntity; import com.vito.jeeboot.modules.cms.entity.ResultBaseEntity; import com.vito.jeeboot.modules.cms.service.ArticleService; import com.vito.jeeboot.modules.spider.entity.JT023; /*** * jsoup抓取新闻 * @author Administrator * */ public class JsoupPerform { protected static Logger logger = LoggerFactory.getLogger(JsoupPerform.class); @Autowired private static String flagType = null; //网页编码 private static String ContentType = ""; /** * 抓取新闻 * @param jt023 * @return */ public static List<Article> parseHtmlForArticle(JT023 jt023){ List<Article> articleList = new ArrayList<>(); int i = 0; String articleHref = ""; String from = ""; String createTime = ""; String imageLink = ""; String desc = ""; String title = ""; try { CloseableHttpClient client = HttpClients.createDefault(); HttpGet get = new HttpGet(jt023.getC004()); HttpResponse response = client.execute(get); //ajax加载数据时,由于时延、所以渲染html需要时间,所以我们延时 
Thread.sleep(20000); logger.info("网络响应码:" + response.getStatusLine().getStatusCode()); HttpEntity entity = response.getEntity(); //String content = EntityUtils.toString(entity, "utf-8"); //String content = EntityUtils.toString(entity); //开始判断页面编码类型 //ContentType = getContentType(content); // 使用Jsoup解析网页 byte[] bytes = EntityUtils.toByteArray(entity); String content = new String(bytes); // String content = EntityUtils.toString(entity); // 匹配<head></head>之间,出现在<meta>标签中的字符编码 Pattern pattern = Pattern.compile("<head>([\s\S]*?)<meta([\s\S]*?)charset\s*=(")?(.*?)""); Matcher matcher = pattern.matcher(content.toLowerCase()); if (matcher.find()) { String charset = matcher.group(4); flagType = charset; } if(flagType.equals("utf-8")){ content = new String(bytes,"utf-8"); } // 使用Jsoup解析网页 Document doc = Jsoup.parse(content); Elements indexMain = doc.select(jt023.getC016()); //根节点 Iterator<Element> blogIter = indexMain.iterator(); while (blogIter.hasNext()) { Element element = blogIter.next(); if(StringUtils.hasText(jt023.getC009())){ title = element.select(jt023.getC009() != null ? jt023.getC009() : "").html(); //标题 if(StringUtils.isEmpty(title)){ continue; } title = parseStringEncode(title); //暂时屏蔽编码转换的功能 } if(StringUtils.hasText(jt023.getC010())){ desc = element.select(jt023.getC010() != null ? jt023.getC010() : "").text(); //描述 desc = parseStringEncode(desc); //暂时屏蔽编码转换的功能 } if(StringUtils.hasText(jt023.getC014())){ imageLink = element.select(jt023.getC014() != null ? jt023.getC014() : "").attr("src"); //封面图片 } if(StringUtils.hasText(jt023.getC011())){ createTime = element.select(jt023.getC011() != null ? jt023.getC011() : "").text(); //创建时间 } if(StringUtils.hasText(jt023.getC012())){ from = element.select(jt023.getC012() != null ? jt023.getC012() : "").text(); //文章来源 from = parseStringEncode(from); //暂时屏蔽编码转换的功能 } if(StringUtils.hasText(jt023.getC018())){ articleHref = element.select(jt023.getC018() != null ? 
jt023.getC018() : "").attr("href"); //文章详情链接 } if(StringUtils.hasText(jt023.getC019())){//文章链接前缀不为空的时候,文章链接需要加前缀 articleHref = jt023.getC019() + articleHref; } String articleDetail = "<p>未获取到数据</p>"; if(StringUtils.hasText(articleHref)){ try { Document document = Jsoup.connect(articleHref).get(); Elements contentElements = document.select(jt023.getC017() != null ? jt023.getC017() : ""); //文章内容根节点 if(JT023.SPIDER_CHANGE_IMAGE_URL == jt023.getC015()){ for(Element element2 : contentElements){ //当新闻详情中的图片链接需要转换时 String imageUrl = element2.select("img").attr("src"); if (!StringUtils.isEmpty(imageUrl)) { element2.attr("src", jt023.getC024() + imageUrl); } } } articleDetail = contentElements.select(jt023.getC013() != null ? jt023.getC013() : "" ).html(); //文章详情 articleDetail = parseStringEncode(articleDetail); //暂时屏蔽编码转换的功能 if(!StringUtils.hasText(from)){ from = contentElements.select(jt023.getC012()).text(); //文章来源 from = parseStringEncode(from);//暂时屏蔽编码转换的功能 } } catch (Exception e) { logger.info("获取文章详情时出现异常:" + e.getMessage()); e.printStackTrace(); } } if(!StringUtils.hasText(articleDetail)){ articleDetail = title; } Article article = new Article(); ArticleData articleData=new ArticleData(); articleData.setContent(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape("<p>" + articleDetail.replaceAll("(?<=\>)(?:\s* ? 
?)(?=\<)","") + "</p>"))); article.setCopyfrom(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape(from))); article.setTitle(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape(title)))); article.setDesc(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape(desc)))); article.setArticleData(articleData); article.setRelationId("111111"); article.setRelationName("新闻资讯"); article.setOfficeCode("620103"); //七里河区的组织机构代码 article.setKeywords("3"); //关键字 article.setCategory(jt023.getC020()); //栏目 article.setCategoryName(jt023.getC021()); //栏目名称 article.setImage(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape(HtmlUtils.htmlUnescape(imageLink)))); article.setRemarks(articleHref); //在栏目为直播时候、将链接放在该 if(!StringUtils.isEmpty(article.getTitle())){ logger.info(article.getTitle()); articleList.add(article); } //articleService.save(article); i++; logger.info("成功抓取到【 " + i + " 】 条数据"); } } catch (Exception e) { e.printStackTrace(); } return articleList; } /** * 获取页面编码 * @param content * @return */ public static String getContentType(String content){ Pattern pattern = Pattern.compile("<head>([\s\S]*?)<meta([\s\S]*?)charset\s*=(")?(.*?)""); Matcher matcher = pattern.matcher(content.toLowerCase()); if (matcher.find()) { String charset = matcher.group(4); ContentType = charset; }else{ ContentType = "utf-8"; } return ContentType; } /*** * 判断验证字符串的编码格式并修改 * @param oStr * @return */ public static String parseStringEncode(String oStr){ String nString = ""; String fileType = getEncoding(oStr); if("GB2312".equals(flagType.toUpperCase())){ nString = getGB2312ToUtf8(oStr); // 将gb2312字符串转换为utf-8 }else if("GBK".equals(flagType.toUpperCase())){ nString = getUTF8StringFromGBKString(oStr); //将gbk字符串转换为utf-8 }else if("ISO-8859-1".equals(flagType.toUpperCase())){ nString = getIsoToUtf8(oStr); }else{ nString = oStr; } return nString; } /*** * 爬虫接收接口形式的字符串 * @param jt023 * @return */ public static List<Article> parseDataFromPost(JT023 jt023){ List<Article> articleList = new 
ArrayList<>(); try { Connection connection = Jsoup.connect(jt023.getC004()) .header("Content-Type", "application/json") .header("Accept", "application/json") .header("X-Requested-With", "XMLHttpRequest") .ignoreContentType(true) .ignoreHttpErrors(true) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"); connection.requestBody(jt023.getC022()); //接口传参 String result = connection.post().text(); //当类型为 -- 云上七里河时, 将数据转化的导出至云裳七里河 if(JT023.SPIDER_POST == jt023.getC015()){ articleList = praseStringToResultBaseEntity(result , jt023); } } catch (Exception e) { e.printStackTrace(); } return articleList; } /** * 封装 -- 云上七里河爬虫信息 * @param result * @param jt023 * @return */ public static List<Article> praseStringToResultBaseEntity(String result , JT023 jt023){ List<Article> articleList = new ArrayList<>(); if(StringUtils.isEmpty(result)){ return articleList; } ResultBaseEntity rEntity = JSON.parseObject(result, ResultBaseEntity.class); int i = 0; if("0".equals(rEntity.getStatus())){ for(HttpResultEntity resultEntity : rEntity.getData()){ Article article = new Article(); ArticleData articleData=new ArticleData(); articleData.setContent(resultEntity.getRemark()); // 详情 article.setCopyfrom(resultEntity.getOrgName()); // 来源 -- 放的放的云上七里河 article.setTitle(resultEntity.getTopic()); //标题 article.setDesc(resultEntity.getRemark()); //描述 article.setArticleData(articleData); article.setRelationId("111111"); article.setRelationName("新闻资讯"); article.setCategory(jt023.getC020()); //栏目 article.setCategoryName(jt023.getC021()); //栏目名称 article.setImage(resultEntity.getCover()); //图标 article.setRemarks("https://live.xinhuaapp.com/xcy/reportlist.html?liveId=" + resultEntity.getId()); //在栏目为直播时候、将链接放在该 if(!StringUtils.isEmpty(article.getTitle())){ logger.info(article.getTitle()); articleList.add(article); i++; logger.info("成功获取到【 " + i + " 】 条数据"); } } } return articleList; } /*** * ISO-8859-1 转 utf-8 * @param str * 
@return */ public static String getIsoToUtf8(String str){ if (StringUtils.isEmpty(str)){ return ""; } String newStr = ""; try { newStr = new String(str.getBytes("ISO-8859-1")); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return newStr; } public static String getGB2312ToUtf8(String str){ if (StringUtils.isEmpty(str)){ return ""; } String newStr = ""; try { newStr = new String(str.getBytes("GB2312")); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return newStr; } /*** * 转换字体编码 gb2312 -> utf-8 * @param str * @return */ public static String gb2312ToUtf8(String str) { String urlEncode = "" ; try { urlEncode = URLEncoder.encode (str, "UTF-8" ); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return urlEncode; } /*** * gbk--> utf-8 * @param gbkStr * @return */ public static String getUTF8StringFromGBKString(String gbkStr) { try { return new String(gbkStr.getBytes("GBK")); } catch (UnsupportedEncodingException e) { throw new InternalError(); } } /** * gbk--> utf-8 * @param gbkStr * @return */ public static byte[] getUTF8BytesFromGBKString(String gbkStr) { int n = gbkStr.length(); byte[] utfBytes = new byte[3 * n]; int k = 0; for (int i = 0; i < n; i++) { int m = gbkStr.charAt(i); if (m < 128 && m >= 0) { utfBytes[k++] = (byte) m; continue; } utfBytes[k++] = (byte) (0xe0 | (m >> 12)); utfBytes[k++] = (byte) (0x80 | ((m >> 6) & 0x3f)); utfBytes[k++] = (byte) (0x80 | (m & 0x3f)); } if (k < utfBytes.length) { byte[] tmp = new byte[k]; System.arraycopy(utfBytes, 0, tmp, 0, k); return tmp; } return utfBytes; } /*** * 判断字符编码格式 * @param str * @return */ public static String getEncoding(String str){ String encode = "UTF-8"; try{ if(str.equals(new String(str.getBytes(encode),encode))){ String s2 = encode; return s2; } }catch(Exception exception2){ } encode = "GB2312"; try{ if(str.equals(new String(str.getBytes(encode),encode))){ String s = encode; return s; } }catch(Exception exception){ } encode = "ISO-8859-1"; try{ 
if(str.equals(new String(str.getBytes(encode),encode))){ String s1 = encode; return s1; } }catch(Exception exception1){ } encode = "GBK"; try{ if(str.equals(new String(str.getBytes(encode),encode))){ String s3 = encode; return s3; } }catch(Exception exception3){ } return ""; } }
在此做一记录,以备以后不时之需。