zoukankan      html  css  js  c++  java
  • MinerHtmlThread.java 爬取页面线程

    MinerHtmlThread.java 爬取页面线程

    package com.iteye.injavawetrust.miner;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    /**
     * Page-fetching worker thread.
     * <p>
     * Repeatedly takes a URL from the un-visited queue, downloads the page with
     * jsoup and hands the result to the store queue and to the queue of pages
     * waiting for URL extraction. The loop ends once
     * {@code MinerMonitorThread.done} becomes true.
     *
     * @author InJavaWetrust
     */
    public class MinerHtmlThread extends Thread {
    	
    	private static final Log LOG = LogFactory.getLog(MinerHtmlThread.class);
    	
    	/** Crawler configuration; read here for the maximum crawl depth. */
    	private final MinerConfig config;
    	
    	/**
    	 * @param config crawler configuration used to obtain the maximum crawl depth
    	 */
    	public MinerHtmlThread(MinerConfig config) {
    		this.config = config;
    	}
    	
    	/**
    	 * Keeps fetching pages until the monitor thread flags the crawl as done.
    	 */
    	@Override
    	public void run() {
    		while (!MinerMonitorThread.done) {
    			minerHtml();
    		}
    	}
    	
    	/**
    	 * Fetches a single page: polls one entry from the un-visited queue, downloads
    	 * it and enqueues the resulting {@code Html} for storage and for URL extraction.
    	 * <p>
    	 * Returns without doing anything when the queue yields no entry, the URL is
    	 * blank, the entry is deeper than the configured maximum depth, or the URL
    	 * does not contain {@code "http"}. Any exception raised while fetching is
    	 * logged and swallowed so that the calling loop keeps running.
    	 */
    	public synchronized void minerHtml() {
    		MinerUrl minerUrl = MinerQueue.unVisitedPoll(); // dequeue the next URL to visit
    		try {
    			// Skip empty polls, blank URLs and entries beyond the allowed crawl depth.
    			if (null == minerUrl || MinerUtil.isBlank(minerUrl.getUrl()) || minerUrl.getDepth() > config.getMaxDepth()) {
    				return;
    			}
    			String url = minerUrl.getUrl();
    			// Skip URLs that do not contain "http". The receiver must be the URL:
    			// the previous "http".contains(url) tested the opposite relation and
    			// therefore never rejected a real URL.
    			if (!url.contains("http")) {
    				LOG.info("MinerHtmlThread当前爬取URL[" + url + "]没有http");
    				return;
    			}
    			LOG.info("MinerHtmlThread当前爬取页面[" + url + "]爬取深度[" + minerUrl.getDepth() + "] 当前线程 [" + Thread.currentThread().getName() + "]");
    			Connection conn = Jsoup.connect(url);
    			// Send a browser-like User-Agent header with the request.
    			conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");
    			Document doc = conn.get();
    			
    			Html html = new Html();
    			html.setUrl(url);
    			html.setHtml(doc.html());
    			html.setDepth(minerUrl.getDepth());
    			
    			// Add to the store queue.
    			MinerQueue.addStore(html);
    			
    			// The fetched page also goes to the queue of pages waiting for URL extraction.
    			MinerQueue.addWaitingMisering(html);
    			
    		} catch (Exception e) {
    			// minerUrl is non-null here: a null poll returns before anything can throw.
    			// Pass the exception itself so its type and stack trace are not lost.
    			LOG.warn("MinerHtmlThread爬取页面失败 URL [" + minerUrl.getUrl() + "]", e);
    		}
    		
    	}
    
    }
    

    返回列表

  • 相关阅读:
    leetcode — interleaving-string
    leetcode — unique-binary-search-trees-ii
    leetcode — unique-binary-search-trees
    leetcode — binary-tree-inorder-traversal
    leetcode — restore-ip-addresses
    poj 2774 Long Long Message
    bzoj 1031 [JSOI2007]字符加密Cipher
    BZOJ4554 HEOI2016游戏
    BZOJ4552 HEOI2016排序
    BZOJ4551 HEOI2016树
  • 原文地址:https://www.cnblogs.com/new0801/p/6146688.html
Copyright © 2011-2022 走看看