zoukankan      html  css  js  c++  java
  • Java爬虫抓取腾讯漫画评论

    package com.eteclab.wodm.utils;
    
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.concurrent.Executor;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class MySearchTest2 {
    	private final static Executor executor = Executors.newCachedThreadPool();// 启用多线程
    	private static String mainUrl = "http://ac.qq.com/Comic/index/state/pink/page/";// 可以根据腾讯漫画的分类来进行抓取
    	private static String url = "http://ac.qq.com/Jump";// +/Comic/comicInfo/id/11111
    														// 可以获取具体的漫画页面
    
    	public static void main(String[] args) {
    		for (int i = 1; i <= 144; i++) {//可以分析漫画的总页数来进行调用
    			final int j = i;
    			executor.execute(new Runnable() {
    				@Override
    				public void run() {
    					try {
    						System.out.println("begin*************第" + j + "页");
    						getArticleListFromUrl(mainUrl + j, j);
    						System.out.println("end*************第" + j + "页");
    					} catch (Exception e) {
    						System.err.println("**********************获取漫画错误**********************");
    						e.printStackTrace();
    					}
    				}
    			});
    
    		}
    	}
    
    	/**
    	 * 获取日漫列表
    	 * 
    	 * @param listurl
    	 */
    	public static void getArticleListFromUrl(String listurl, int j) {
    		Document doc = null;
    		try {
    			doc = Jsoup
    					.connect(listurl)
    					.userAgent(
    							"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0")
    					.timeout(3000).get();
    		} catch (IOException e) {
    			System.err.println("**********************获取评论请求错误**********************");
    			e.printStackTrace();
    		}
    		// System.out.println(doc);
    		Elements elements = doc.getElementsByTag("a");// 找到所有a标签
    		for (Element element : elements) {
    			String relHref = element.attr("href"); // ==
    													// "/"这个是href的属性值,一般都是链接。这里放的是漫画的连接
    			String linkHref = element.text();
    			// 用if语句过滤掉不是漫画链接的内容
    			if (!relHref.startsWith("http://")
    					&& relHref.contains("/Comic/comicInfo/id")) {
    				StringBuffer sb = new StringBuffer();
    				sb.append(url).append(relHref);
    				String id = sb.substring(sb.lastIndexOf("/") + 1);
    				try {
    					for (int i = 1; i <= 50; i++) {//默认取50页评论
    						getArticleFromUrl(sb.toString(), Integer.valueOf(id),
    								i, j);// 查询第i页的评论
    					}
    				} catch (Exception e) {
    					// TODO: handle exception
    					System.err.println("**********************获取评论分页错误**********************");
    					e.printStackTrace();
    				}
    			}
    		}
    
    	}
    
    	/**
    	 * 获取评论内容,调用评论接口主要就是要获取漫画页面的cookies信息,调用时一起传过去;
    	 * 
    	 * @param detailurl
    	 *            评论的url
    	 * @param id
    	 *            资源id
    	 * @param page
    	 *            评论页数
    	 */
    	public static void getArticleFromUrl(String detailurl, Integer id,
    			Integer page, Integer mainIndex) {
    		try {
    			long i = System.currentTimeMillis();// 生成时间戳
    			Connection connect = Jsoup
    					.connect("http://ac.qq.com/Community/topicList?targetId="
    							+ id + "&page=" + page + "&_=" + i);
    			Map<String, String> header = new HashMap<String, String>();
    			header.put("User-Agent",
    					"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0");
    			header.put(
    					"Referer",
    					"http://ac.qq.com/Comic/ComicInfo/id/530132?trace_id=1_907_10.194.156.134_1504854317");
    			header.put(
    					"Cookie",
    					"LW_uid=q19499A3B6c0z0Y4z0k5h18046; pgv_pvid=8070181612; eas_sid=11O4U96326x0V0r4j0I5b2c073; pgv_pvi=623979520"
    							+ "; pt2gguin=o0877101804; RK=zfdTLMzqZc; ptcz=264e6df783796823cf379b14e6aef6aa3be6a4e2fb4b6126692ee05c2a0b0c4c"
    							+ "; ue_ts=1493600756; ue_uk=a058f8c6bbbe035c75bece7707297348; ue_uid=e5fb4837d184233402086deba8d197aa;"
    							+ " ue_skey=0e157906ef4cb8f560768be75c751a72; LW_pid=7813c0ffd4b168e438f4a5a82ad1c993; ts_uid=2015751548"
    							+ "; ts_refer=www.baidu.com/link; theme=white; roastState=2; readRecord=%5B%5B505430%2C%22%E8%88%AA%E6%B5"
    							+ "%B7%E7%8E%8B%22%2C888%2C%22%E7%AC%AC871%E8%AF%9D%20%E5%8A%A0%E6%B2%B9%E5%95%8A%EF%BC%81%E5%87%AF%E6%92"
    							+ "%92%EF%BC%81%EF%BC%81%22%2C871%5D%5D; readLastRecord=%5B%5D; pgv_si=s8053975040; pgv_info=ssid=s26281936"
    							+ "; ts_last=ac.qq.com/Comic/ComicInfo/id/530132; girlHideState=1; topicPop=1; pc_userinfo_cookie=; o_cookie"
    							+ "=877101804");
    			Connection data = connect.headers(header);
    			Document document = data.get();
    			Elements elements = document.getElementsByAttributeValue("class",
    					"comment-content-detail");
    			List<String> commList = new ArrayList<String>();
    			for (Element element : elements) {
    				commList.add(element.text());
    			}
    
    			/*JSONArray json = new JSONArray();
    			for (int j = 0; j < commList.size(); j++) {
    				JSONObject jo = new JSONObject();
    				jo.put("comment", commList.get(j));
    				json.add(jo);
    			}
    			String comment = json.toString();*/
    			String comment = StringUtils.listToString(commList, '
    ');
    			String date = DateUtilsTool.getLongDate(new Date());
    			String indexString = formatCode(mainIndex.toString());
    			saveArticle(date +indexString , comment);
    
    		} catch (IOException e) {
    			System.err.println("**********************获取评论错误**********************");
    			e.printStackTrace();
    		}
    
    	}
    
    	private static String formatCode(String code) {
    		StringBuilder sb = new StringBuilder();
    		int a = 4 - code.length();
    		if (a < 0) {
    			throw new RuntimeException("formatCode错误 code超过9999");
    		}
    		for (int i = 0; i < a; i++) {
    			sb.append("0");
    		}
    		sb.append(code);
    		return sb.toString();
    	}
    
    	/**
    	 * 保存内容到本地
    	 * 
    	 * @param titile
    	 * @param content
    	 * @param blogName
    	 */
    	public static void saveArticle(String titile, String content) {
    		String filePath = "d:\MyLoadArticle\" + titile + ".txt";// 保存到本地的路径和文件名
    		File file = new File(filePath);
    		if (!file.getParentFile().exists()) {
    			file.getParentFile().mkdirs();
    		}
    		try {
    			file.createNewFile();
    		} catch (IOException e) {
    			System.err.println("*******************读取文件错误*******************");
    			e.printStackTrace();
    			
    		}
    		try {
    			FileWriter fw = new FileWriter(file, true);
    			BufferedWriter bw = new BufferedWriter(fw);
    			bw.write(content);
    			bw.flush();
    			bw.close();
    			fw.close();
    		} catch (IOException e) {
    			System.err.println("*******************写入文件错误*******************");
    			e.printStackTrace();
    		}
    
    	}
    
    }
    

      pom.xml文件需要添加的jar包

    <dependency>
    			<groupId>org.jsoup</groupId>
    			<artifactId>jsoup</artifactId>
    			<version>1.10.3</version>
    		</dependency>
    

      程序中的工具类:

    // Date conversion utility
    	/**
    	 * Formats a Date as a compact date string using the pattern yyyyMMdd
    	 * (year, month, day; no time-of-day part).
    	 * 
    	 * @author: Simon
    	 * @date: 2017-09-09 09:40:39
    	 * @param date
    	 *            the date to format
    	 * @return the formatted date, e.g. 20170909
    	 */
    	public static String getLongDate(Date date) {
    		// A new formatter per call; SimpleDateFormat instances are not shared.
    		return new SimpleDateFormat("yyyyMMdd").format(date);
    	}
    // Joins the elements of a list into a single String using the given separator character
    	/**
    	 * Converts a list to a String by concatenating the string form of its
    	 * elements, placing the separator between neighbouring elements (not
    	 * before the first and not after the last one).
    	 * 
    	 * @author: Simon
    	 * @date: 2017-09-09 10:24:52
    	 * @param list
    	 *            elements to join; null is treated like an empty list, and a
    	 *            null element is rendered as "null"
    	 * @param separator
    	 *            character placed between two elements
    	 * @return the joined text, or an empty String for a null or empty list
    	 */
    	public static String listToString(List<?> list, char separator) {
    		if (list == null || list.isEmpty()) {
    			return "";
    		}
    		StringBuilder sb = new StringBuilder();
    		boolean first = true;
    		// Iterating (instead of get(i)) stays linear for every List type.
    		for (Object item : list) {
    			if (!first) {
    				sb.append(separator);
    			}
    			sb.append(item);
    			first = false;
    		}
    		return sb.toString();
    	}
    

      

  • 相关阅读:
    精确率、召回率、F1 值、ROC、AUC 各自的优缺点是什么?
    使用sklearn做单机特征工程
    机器学习算法比较
    CNN-卷积层和池化层学习
    卷积神经网络概念与原理
    CNN(卷积神经网络)、RNN(循环神经网络)、DNN(深度神经网络)概念区分理解
    技能|三次简化一张图:一招理解LSTM/GRU门控机制
    【转】TensorFlow四种Cross Entropy算法实现和应用
    http://blog.csdn.net/u014595019/article/details/52805444
    神经网络和深度学习简史(三)
  • 原文地址:https://www.cnblogs.com/SimonHu1993/p/7504970.html
Copyright © 2011-2022 走看看