zoukankan      html  css  js  c++  java
  • java实现网络爬虫

    import java.io.IOException;
    import java.util.ArrayDeque;
    import java.util.Deque;
    import java.util.HashSet;
    import java.util.Set;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
     
    public class TestClass {  
        private static Set<String> urlSet = new HashSet<String>();  
        private static Pattern p = Pattern  
                .compile(  
                        "^(((http|https)://" +  
                        "(www.|([1-9]|[1-9]\d|1\d{2}|2[0-1]\d|25[0-5])" +  
                        "(\.(\d|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])){3}:[0-9]+/)?)" +  
                        "{1}.+){1}quot;,  
                        Pattern.CASE_INSENSITIVE);  
     
        public static void main(String[] args) {  
            String baseUrl = "http://www.sina.com";  
            spiderInternet(baseUrl, "");  
        }  
     
        private static void spiderInternet(String baseUrl, String exUrl) {  
            if (baseUrl.endsWith("/") && exUrl.startsWith("/")) {  
                baseUrl = baseUrl.substring(0, baseUrl.length() - 1);  
            }  
            String new_url = baseUrl + exUrl;  
            if (urlSet.contains(new_url)) {  
                return;  
            }  
            System.out.println(new_url);  
            try {  
                Document doc = Jsoup.connect(new_url).get();  
                urlSet.add(new_url);  
                Elements links = doc.select("a[href]");  
                for (Element link : links) {  
                    String linkHref = link.attr("href");  
                    if (linkHref.equals("#")) {  
                        return;  
                    }  
                    Matcher matcher = p.matcher(linkHref);  
                    if (matcher.matches()) {  
                        spiderInternet(linkHref, "");  
                    } else {  
                        spiderInternet(baseUrl, linkHref);  
                    }  
                }  
            } catch (IOException e) {  
                e.printStackTrace();  
            }  
        }  
    } 
  • 相关阅读:
    ADO.NET操作PostgreSQL:数据库操作类(未封装)
    ADO.NET操作SQL Server:数据库操作类(已封装)
    ADO.NET操作SQL Server:数据库操作类(未封装)
    h5微信支付
    css3的transform:translateZ没有效果
    判断页面是否被嵌入iframe里面
    vue项目的环境变量
    iOS下调用元素的focus方法,input元素不聚焦,键盘不弹起的问题
    关于common.js里面的module.exports与es6的export default的思考总结
    Vue2.4.0 新增的inheritAttrs,attrs
  • 原文地址:https://www.cnblogs.com/Jansens520/p/7825773.html
Copyright © 2011-2022 走看看