zoukankan      html  css  js  c++  java
  • MinerUtil.java 爬虫工具类

    MinerUtil.java 爬虫工具类

    package com.iteye.injavawetrust.miner;
    
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.io.Writer;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.TimeZone;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * 爬虫工具类
     * @author InJavaWeTrust
     *
     */
    public class MinerUtil {
    	
    	private static final Log LOG = LogFactory.getLog(MinerUtil.class);
    	
    	public static long starTime = 0;
    	
    	/**
    	 * 判断是否为空
    	 * @param param
    	 * @return true-为空;false-非空
    	 */
    	public static boolean isBlank(String param) {
    		return (null == param || "".equals(param.trim())) ? true : false;
    	}
    	
    	/**
    	 * URL是否以html结尾
    	 * @param url
    	 * @return true-是;false-否
    	 */
    	public static boolean checkURL(String url) {
    		String html = url.substring(url.lastIndexOf(".") + 1);
    		return "html".equals(html) ? true : false;
    	}
    	/**
    	 * URL列表是否包含关键字
    	 * @param key 关键字
    	 * @param keys URL列表
    	 * @return true-是;false-否
    	 */
    	public static boolean checkKeys(String key, List<String> keys) {
    		boolean flag = false;
    		for(String k : keys) {
    			if(key.contains(k)){
    				flag = true;
    				break;
    			}
    		}
    		return flag;
    	}
    	
    	public static boolean isValidFileName(String fileName) {
    		if (fileName == null || fileName.length() > 255){
    			return false;
    		} else {
    			return fileName
    					.matches("[^\s\\/:\*\?\"<>\|](\x20|[^\s\\/:\*\?\"<>\|])*[^\s\\/:\*\?\"<>\|\.]$");
    		}
    	} 
    	
    	/**
    	 * 获取URL
    	 * @param url URL
    	 * @return URL
    	 */
    	public static Set<String> getAllUrl(String url){
    		Set<String> urls = new HashSet<String>();
    		try {
    			Connection conn = Jsoup.connect(url);
    			conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");//模拟浏览器  
    			Document document = conn.timeout(5000).get();
    			Elements hrefs = document.select("a[href]");
    			Iterator<Element> hrefIter = hrefs.iterator();
    			while (hrefIter.hasNext()) {
    				Element href = hrefIter.next();
    				urls.add(href.attr("href"));
    			}
    		} catch (Exception e) {
    			LOG.info("获取URL出现异常,异常URL[" + url + "]");
    			LOG.info("异常信息[" + e.getMessage() + "]");
    		}
    		return urls;
    	}
    	
    	/**
    	 * 毫秒转换成hhmmss
    	 * @param ms 毫秒
    	 * @return hh:mm:ss
    	 */
    	public static String msToss(long ms) {
    		SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
    		formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
    		String ss = formatter.format(ms);
    		return ss;
    	}
    	
    	/**
    	 * 将html写入本地文件
    	 * @param htmlText html内容
    	 * @param htmlName html名称
    	 */
    	public static void getHtmlToLocal(Map<String, String> map){
    		Writer writer = null;
    		try {
    			String path = MinerConstanits.HTMLPATH + getToday();
    			makeDir(path);
    			writer = new OutputStreamWriter(new FileOutputStream(new File(path
    					+ File.separator + map.get("title"))), "UTF-8");
    			writer.write(map.get("html"));
    			writer.flush();
    		} catch (FileNotFoundException e) {
    			e.printStackTrace();
    		} catch (IOException e) {
    			e.printStackTrace();
    		} finally {
    			if (writer != null) {
    				try {
    					writer.close();
    				} catch (IOException e) {
    					e.printStackTrace();
    				}
    			}
    		}
    	}
    	/**
    	 * 文件名不能包含下列任何字符:<br>
    	 * /:*?"<>|
    	 * @param title 标题
    	 * @return 去掉文件名不能包含的字符
    	 */
    	public static String fileName(String title){
    		return title
    				.replaceAll("\\", "")
    				.replaceAll("/", "")
    				.replaceAll(":", "")
    				.replaceAll("\*", "")
    				.replaceAll("\?", "")
    				.replaceAll(""", "")
    				.replaceAll("<", "")
    				.replaceAll(">", "")
    				.replaceAll("\|", "");
    	}
    	/**
    	 * 获取当天日期
    	 * @return 当天日期
    	 */
    	public static String getToday(){
    		String result = "";
    		Date date = new Date();
    		result = format(date);
    		return result;
    	}
    	/**
    	 * 格式化日期
    	 * @param date 日期
    	 * @return yyyymmdd 日期
    	 */
    	public static String format(Date date){
    		String format = "yyyyMMdd";
    		SimpleDateFormat fmt = new SimpleDateFormat(format);
    		return fmt.format(date);
    	}
    	/**
    	 * 创建存储目录
    	 * @param path 存储目录
    	 */
    	public static void makeDir(String path) {
    		File file = new File(path);
    		if(!file.exists()){
    			file.mkdirs();
    			LOG.info("创建存储目录[" + path + "]");
    		}
    	}
    	
    	public static boolean checkBeforeStart(MinerConfig config) {
    		if(null == config){
    			LOG.info("config未配置!!!");
    			return false;
    		}
    		if(null == config.getKeys() || 0 == config.getKeys().size()){
    			LOG.info("包含关键字未配置!!!");
    			return false;
    		}
    		if(null == config.getStoreType()){
    			LOG.info("存储方式未配置!!!");
    			return false;
    		}
    		if(config.getMaxDepth() < 1){
    			LOG.info("爬取页面最大深度配置错误!!!");
    			return false;
    		}
    		if(config.getMinerHtmlThreadNum() < 1){
    			LOG.info("下载页面线程数配置错误!!!");
    			return false;
    		}
    		if(config.getMiseringThreadNum() < 1){
    			LOG.info("分析页面线程数配置错误!!!");
    			return false;
    		}
    		if(config.getMinserStoreThreadNum() < 1){
    			LOG.info("存储线程数配置错误!!!");
    			return false;
    		}
    		return true;
    	}
    	
    	public static void main(String[] args) {
    		String path = MinerConstanits.HTMLPATH + File.separator + getToday();
    		makeDir(path);
    //		System.out.println(getToday());
    //		String test = "http://my.163.com/2015/11/27/17763_578935.html";
    //		System.out.println(fileName(test));
    //		System.out.println(MinerUtil.isBlank(null));
    //		System.out.println(MinerUtil.isBlank(""));
    //		System.out.println(MinerUtil.isBlank(" "));
    //		System.out.println(MinerUtil.isBlank("bbb"));
    //		System.out.println(MinerUtil.isBlank(" bbb "));
    		
    //		String key = "http://www.jqu.net.cn";
    //		List<String> keys = new ArrayList<String>();
    //		keys.add("http://www.jqu.net.cn");
    //		System.out.println(MinerUtil.checkKeys(key, keys));
    	}
    
    }
    

    返回列表

  • 相关阅读:
    Linq聚合操作之Aggregate,Count,Sum,Distinct源码分析
    Linq分区操作之Skip,SkipWhile,Take,TakeWhile源码分析
    Linq生成操作之DefautIfEmpty,Empty,Range,Repeat源码分析
    Linq基础操作之Select,Where,OrderBy,ThenBy源码分析
    PAT 1152 Google Recruitment
    PAT 1092 To Buy or Not to Buy
    PAT 1081 Rational Sum
    PAT 1084 Broken Keyboard
    PAT 1077 Kuchiguse
    PAT 1073 Scientific Notation
  • 原文地址:https://www.cnblogs.com/muyuge/p/6152078.html
Copyright © 2011-2022 走看看