zoukankan      html  css  js  c++  java
  • MinerUtil.java 爬虫工具类

    MinerUtil.java 爬虫工具类

    package com.iteye.injavawetrust.miner;
    
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.io.Writer;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.TimeZone;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * 爬虫工具类
     * @author InJavaWeTrust
     *
     */
    public class MinerUtil {
    	
    	private static final Log LOG = LogFactory.getLog(MinerUtil.class);
    	
    	public static long starTime = 0;
    	
    	/**
    	 * Tells whether a string is {@code null}, empty, or consists only of
    	 * characters removed by {@link String#trim()}.
    	 *
    	 * @param param the string to inspect; may be {@code null}
    	 * @return {@code true} when the string is blank, {@code false} otherwise
    	 */
    	public static boolean isBlank(String param) {
    		if (param == null) {
    			return true;
    		}
    		return param.trim().length() == 0;
    	}
    	
    	/**
    	 * Tells whether the text after the last {@code '.'} of a URL is exactly
    	 * {@code "html"} (case-sensitive).
    	 *
    	 * <p>A {@code null} URL, or one that contains no dot at all, is reported
    	 * as not being an html URL instead of throwing or matching by accident.
    	 *
    	 * @param url the URL to inspect; may be {@code null}
    	 * @return {@code true} when the URL has the extension {@code html}
    	 */
    	public static boolean checkURL(String url) {
    		if (url == null) {
    			return false;
    		}
    		int lastDot = url.lastIndexOf('.');
    		// Without a dot there is no extension; do not compare the whole string.
    		if (lastDot < 0) {
    			return false;
    		}
    		return "html".equals(url.substring(lastDot + 1));
    	}
    	/**
    	 * Tells whether {@code key} contains at least one entry of {@code keys}
    	 * as a substring.
    	 *
    	 * @param key  the string that is searched
    	 * @param keys the substrings to look for
    	 * @return {@code true} as soon as one entry of {@code keys} occurs in
    	 *         {@code key}; {@code false} when none does
    	 */
    	public static boolean checkKeys(String key, List<String> keys) {
    		for (String candidate : keys) {
    			if (key.contains(candidate)) {
    				return true;
    			}
    		}
    		return false;
    	}
    	
    	public static boolean isValidFileName(String fileName) {
    		if (fileName == null || fileName.length() > 255){
    			return false;
    		} else {
    			return fileName
    					.matches("[^\s\\/:\*\?\"<>\|](\x20|[^\s\\/:\*\?\"<>\|])*[^\s\\/:\*\?\"<>\|\.]$");
    		}
    	} 
    	
    	/**
    	 * Downloads a page and collects the {@code href} attribute of every
    	 * {@code a[href]} element found in it.
    	 *
    	 * <p>Any exception raised while fetching or parsing is logged and
    	 * swallowed; the links gathered up to that point (possibly none) are
    	 * returned.
    	 *
    	 * @param url address of the page to fetch
    	 * @return the set of {@code href} values, never {@code null}
    	 */
    	public static Set<String> getAllUrl(String url){
    		Set<String> links = new HashSet<String>();
    		try {
    			// The User-Agent header makes the request look like a browser.
    			Document page = Jsoup.connect(url)
    					.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13")
    					.timeout(5000)
    					.get();
    			for (Element anchor : page.select("a[href]")) {
    				links.add(anchor.attr("href"));
    			}
    		} catch (Exception e) {
    			LOG.info("获取URL出现异常,异常URL[" + url + "]");
    			LOG.info("异常信息[" + e.getMessage() + "]");
    		}
    		return links;
    	}
    	
    	/**
    	 * Formats a duration given in milliseconds as {@code hh:mm:ss}.
    	 *
    	 * <p>The hour field is not limited to 23: a duration of 25 hours is
    	 * rendered as {@code 25:00:00} rather than wrapping to {@code 01:00:00}.
    	 * Fractions of a second are truncated. A negative duration is rendered
    	 * with a leading {@code '-'}.
    	 *
    	 * @param ms duration in milliseconds
    	 * @return the duration as zero-padded {@code hh:mm:ss}
    	 */
    	public static String msToss(long ms) {
    		long totalSeconds = ms / 1000;
    		StringBuilder text = new StringBuilder();
    		if (totalSeconds < 0) {
    			text.append('-');
    			totalSeconds = -totalSeconds;
    		}
    		long[] fields = {
    				totalSeconds / 3600,
    				(totalSeconds % 3600) / 60,
    				totalSeconds % 60
    		};
    		for (int i = 0; i < fields.length; i++) {
    			if (i > 0) {
    				text.append(':');
    			}
    			// Zero-pad each field to two digits.
    			if (fields[i] < 10) {
    				text.append('0');
    			}
    			text.append(fields[i]);
    		}
    		return text.toString();
    	}
    	
    	/**
    	 * Writes a downloaded page to a local file, encoded as UTF-8.
    	 *
    	 * <p>The target directory is {@code MinerConstanits.HTMLPATH} followed by
    	 * today's date and is created when missing. I/O failures are logged and
    	 * not propagated.
    	 *
    	 * @param map must contain the entry {@code "title"} (used as the file
    	 *            name) and the entry {@code "html"} (the content to write)
    	 */
    	public static void getHtmlToLocal(Map<String, String> map){
    		// NOTE(review): no separator is inserted between HTMLPATH and the date
    		// here, whereas main() inserts File.separator - confirm that HTMLPATH
    		// already ends with a separator.
    		String path = MinerConstanits.HTMLPATH + getToday();
    		File target = new File(path + File.separator + map.get("title"));
    		Writer writer = null;
    		try {
    			makeDir(path);
    			writer = new OutputStreamWriter(new FileOutputStream(target), "UTF-8");
    			writer.write(map.get("html"));
    			writer.flush();
    		} catch (IOException e) {
    			// FileNotFoundException is an IOException, so one handler covers both.
    			LOG.error("写入html文件出现异常,文件[" + target + "]", e);
    		} finally {
    			if (writer != null) {
    				try {
    					writer.close();
    				} catch (IOException e) {
    					LOG.warn("关闭html文件出现异常,文件[" + target + "]", e);
    				}
    			}
    		}
    	}
    	/**
    	 * 文件名不能包含下列任何字符:<br>
    	 * /:*?"<>|
    	 * @param title 标题
    	 * @return 去掉文件名不能包含的字符
    	 */
    	public static String fileName(String title){
    		return title
    				.replaceAll("\\", "")
    				.replaceAll("/", "")
    				.replaceAll(":", "")
    				.replaceAll("\*", "")
    				.replaceAll("\?", "")
    				.replaceAll(""", "")
    				.replaceAll("<", "")
    				.replaceAll(">", "")
    				.replaceAll("\|", "");
    	}
    	/**
    	 * Returns the current date rendered by {@link #format(Date)}.
    	 *
    	 * @return today's date as text
    	 */
    	public static String getToday(){
    		return format(new Date());
    	}
    	/**
    	 * Renders a date with the pattern {@code yyyyMMdd} in the default time
    	 * zone.
    	 *
    	 * @param date the date to render
    	 * @return the date as eight digits, e.g. {@code 20151127}
    	 */
    	public static String format(Date date){
    		return new SimpleDateFormat("yyyyMMdd").format(date);
    	}
    	/**
    	 * Creates a storage directory, including missing parents, when nothing
    	 * exists at the given path yet.
    	 *
    	 * <p>Success is logged only when the directory was actually created; a
    	 * failed creation is logged as a warning.
    	 *
    	 * @param path the directory to create
    	 */
    	public static void makeDir(String path) {
    		File dir = new File(path);
    		if (dir.exists()) {
    			return;
    		}
    		if (dir.mkdirs()) {
    			LOG.info("创建存储目录[" + path + "]");
    		} else if (!dir.isDirectory()) {
    			// mkdirs() also returns false when another thread created the
    			// directory in the meantime; only a still-missing directory is a failure.
    			LOG.warn("创建存储目录失败[" + path + "]");
    		}
    	}
    	
    	/**
    	 * Validates a configuration before a run starts.
    	 *
    	 * <p>The checks run in a fixed order and stop at the first failure, which
    	 * is logged: the config itself, its keys (non-null and non-empty), its
    	 * store type (non-null), and then the max depth and the three thread
    	 * counts (each at least 1).
    	 *
    	 * @param config the configuration to validate; may be {@code null}
    	 * @return {@code true} when every check passes, {@code false} otherwise
    	 */
    	public static boolean checkBeforeStart(MinerConfig config) {
    		String problem = null;
    		if (config == null) {
    			problem = "config未配置!!!";
    		} else if (config.getKeys() == null || config.getKeys().size() == 0) {
    			problem = "包含关键字未配置!!!";
    		} else if (config.getStoreType() == null) {
    			problem = "存储方式未配置!!!";
    		} else if (config.getMaxDepth() < 1) {
    			problem = "爬取页面最大深度配置错误!!!";
    		} else if (config.getMinerHtmlThreadNum() < 1) {
    			problem = "下载页面线程数配置错误!!!";
    		} else if (config.getMiseringThreadNum() < 1) {
    			problem = "分析页面线程数配置错误!!!";
    		} else if (config.getMinserStoreThreadNum() < 1) {
    			problem = "存储线程数配置错误!!!";
    		}
    		if (problem != null) {
    			LOG.info(problem);
    			return false;
    		}
    		return true;
    	}
    	
    	/**
    	 * Manual entry point: creates today's storage directory below
    	 * {@code MinerConstanits.HTMLPATH}.
    	 *
    	 * @param args unused
    	 */
    	public static void main(String[] args) {
    		makeDir(MinerConstanits.HTMLPATH + File.separator + getToday());
    	}
    
    }
    

    返回列表

  • 相关阅读:
    mongodb常用命令(转)
    C++位运算详解(转)
    C++Vector用法(转)
    php下载文件
    二维数组和指针(转)
    php数据采集(转)
    通过PHP实现浏览器点击下载TXT文档(转)
    Linux 文件颜色的含义
    如何在Linux下创建与解压zip, tar, tar.gz和tar.bz2文件【转】
    X11VNC:让Windows可以远程管理Ubuntu桌面
  • 原文地址:https://www.cnblogs.com/new0801/p/6146682.html
Copyright © 2011-2022 走看看