zoukankan      html  css  js  c++  java
  • 提取网页链接

    package com.zyw.regex;
    
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.Iterator;
    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    public class TestRegularExpression {
        public static void main(String[] args) {
            Map<UrlContent,Boolean> firstallUrl=new LinkedHashMap();
            Map<UrlContent,Boolean> secondallUrl=new LinkedHashMap();
            Pattern p=Pattern.compile("<a.*?href=["']?((https?://)?/?[^"']+)["']?.*?>(.+)</a>");//匹配整个<a></a>
            Pattern p1=Pattern.compile("(https?|ftp|http)://[a-zA-Z0-9]*.[a-zA-Z0-9]+.\w{2,3}/[\w\d-/.]*(?=")");//匹配url
            Pattern p2=Pattern.compile("(?<=>)[\w\su4e00-u9fa5]*(?=</a>)");//匹配<a></a>中内容
            addUrl(firstallUrl, "http://www.qq.com/", p, p1, p2);
            for (Iterator it = firstallUrl.keySet().iterator(); it.hasNext();) {
                UrlContent key = (UrlContent) it.next();
                addUrl(secondallUrl, key.getUrl(), p, p1, p2);
                if (secondallUrl.size() > 1000)
                    break;
            }
            int i = 0;
            for (UrlContent key : secondallUrl.keySet()) {
                System.out.println(++i + " " + key.getUrl() + " -----"+ key.getContent());
            }
    }
    
        public static void addUrl(Map<UrlContent, Boolean> allUrl,String link, Pattern p,Pattern p1, Pattern p2) { 
            try {
                URL url = new URL(link);
                InputStream in = url.openStream();
                InputStreamReader isr = new InputStreamReader(in, "utf-8");
                BufferedReader br = new BufferedReader(isr);
                String s = "";
                while ((s = br.readLine()) != null) {
                    Matcher m=p.matcher(s);
                    while (m.find()){
                        UrlContent content=new UrlContent();
                        String text=m.group();
                        Matcher m1=p1.matcher(text);
                        Matcher m2=p2.matcher(text);
                        while (m1.find()){
                            content.setUrl(m1.group());
                        }
                        while (m2.find()){
                            content.setContent(m2.group());
                        }
                        if(content.getUrl()!=null)
                        allUrl.put(content, false);
                    }
                    s = br.readLine();
                }
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    
    }
    package com.zyw.regex;
    
    public class UrlContent {
        private String url;
        private String content;
        public String getUrl() {
            return url;
        }
        public void setUrl(String url) {
            this.url = url;
        }
        public String getContent() {
            return content;
        }
        public void setContent(String content) {
            this.content = content;
        }
        
    }
  • 相关阅读:
    【24.17%】【codeforces 721D】Maxim and Array
    【26.42%】【codeforces 745C】Hongcow Builds A Nation
    【67.24%】【codeforces 745A】Hongcow Learns the Cyclic Shift
    【37.50%】【codeforces 745B】Hongcow Solves A Puzzle
    【78.89%】【codeforces 746A】Compote
    【75.28%】【codeforces 764B】Decoding
    【30.43%】【codeforces 746C】Tram
    【21.58%】【codeforces 746D】Green and Black Tea
    四种生成和解析XML文档的方法详解(介绍+优缺点比较+示例)
    xml常用四种解析方式优缺点的分析×××××
  • 原文地址:https://www.cnblogs.com/yunwuzhan/p/5454100.html
Copyright © 2011-2022 走看看