zoukankan      html  css  js  c++  java
  • MiseringThread.java 解析页面线程

    MiseringThread.java 解析页面线程

    http://injavawetrust.iteye.com

    package com.iteye.injavawetrust.miner;
    
    import java.util.Set;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    
    /**
     * Page-parsing worker thread.
     * <p>
     * Repeatedly takes a page off the "waiting for URL extraction" queue and,
     * while the page is shallower than the configured maximum depth, registers
     * the URLs obtained for that page as unvisited crawl targets one level deeper.
     *
     * @author InJavaWeTrust
     */
    public class MiseringThread extends Thread  {
    	
    	private static final Log LOG = LogFactory.getLog(MiseringThread.class);
    	
    	/** Crawl settings; this class reads the maximum depth and the keyword list from it. */
    	private final MinerConfig config;
    	
    	/**
    	 * Creates a parsing worker.
    	 *
    	 * @param config crawl settings consulted for every processed page
    	 */
    	public MiseringThread(MinerConfig config) {
    		this.config = config;
    	}
    	
    	/**
    	 * Calls {@link #misering()} in a loop until {@code MinerMonitorThread.done} is set.
    	 */
    	@Override
    	public void run() {
    		// NOTE(review): if MinerQueue.waitingMiseringPoll() does not block on an
    		// empty queue, this loop spins at full speed - confirm against MinerQueue.
    		while (!MinerMonitorThread.done) {
    			misering();
    		}
    	}
    	
    	/**
    	 * Processes at most one queued page: collects its URLs, normalizes them,
    	 * filters them by the configured keywords and enqueues the survivors with
    	 * depth {@code page depth + 1}.
    	 * <p>
    	 * NOTE(review): {@code synchronized} here locks on this thread instance only,
    	 * so it does not serialize work across different MiseringThread instances.
    	 */
    	private synchronized void misering() {
    		// Dequeue a page that is waiting to have its URLs extracted.
    		Html html = MinerQueue.waitingMiseringPoll();
    		if (null == html || MinerUtil.isBlank(html.getHtml())) {
    			return;
    		}
    		// Only pages shallower than the crawl depth contribute further URLs.
    		if (html.getDepth() >= config.getMaxDepth()) {
    			return;
    		}
    		LOG.info("MiseringThread获取页面[" + html.getUrl() + "]下所有URL。。。。。。 当前线程 [" + Thread.currentThread().getName() + "]");
    		// NOTE(review): getAllUrl is given the page URL, not the HTML already held
    		// in html - presumably it resolves the links by URL; confirm in MinerUtil.
    		Set<String> urls = MinerUtil.getAllUrl(html.getUrl());
    		if (null == urls) {
    			// Nothing to enqueue; avoids a NullPointerException that would end this thread.
    			return;
    		}
    		int childDepth = html.getDepth() + 1; // crawl depth + 1
    		for (String url : urls) {
    			if (null == url || url.isEmpty()) {
    				continue;
    			}
    			// Drop a single trailing slash.
    			if (url.endsWith("/")) {
    				url = url.substring(0, url.length() - 1);
    				if (url.isEmpty()) {
    					// The URL was just "/": stripping left nothing worth queueing.
    					continue;
    				}
    			}
    			// Skip URLs rejected by the configured keyword check before
    			// allocating a queue entry for them.
    			if (!MinerUtil.checkKeys(url, config.getKeys())) {
    				continue;
    			}
    			
    			MinerUrl minerUrl = new MinerUrl();
    			minerUrl.setUrl(url);
    			minerUrl.setDepth(childDepth);
    			// Add to the unvisited queue; each URL is meant to be visited only once.
    			MinerQueue.addUnVisited(minerUrl);
    			// Record the URL in the URL set so each URL is visited only once.
    			MinerQueue.addUrlSet(minerUrl.getUrl());
    		}
    	}
    	
    }
    

    返回列表

  • 相关阅读:
    java抽象类
    java不支持多继承
    logback颜色
    @ConfigurationProperties、@Value、@PropertySource
    redis命令
    mac下安装rabbitmq
    mac下安装jmeter
    python TypeError: 'int' object is not callable 问题解决
    白炽灯串联发光问题_高中知识(原创)
    python 离散序列 样本数伸缩(原创)
  • 原文地址:https://www.cnblogs.com/muyuge/p/6152077.html
Copyright © 2011-2022 走看看