zoukankan      html  css  js  c++  java
  • 正则表达式提取URL

    package com.url;

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URL;
    import java.net.URLConnection;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /**
     * Downloads a web page and prints the {@code href} target of every anchor
     * ({@code <a ...>}) tag found in it, one per line.
     *
     * <p>The page is read completely and decoded once before any matching is
     * done, so tags or multi-byte characters that straddle a network read
     * boundary are handled correctly.
     */
    public class Urlconnection {

        /** Page fetched by {@link #main(String[])}. */
        private static final String PAGE_URL = "http://www.baidu.com/";

        /** Charset assumed when the response does not declare a usable one. */
        private static final String FALLBACK_CHARSET = "GBK";

        /** Size of the buffer used while draining the response stream. */
        private static final int BUFFER_SIZE = 4096;

        /**
         * Matches an opening anchor tag only: {@code <a>} or {@code <a ...>}.
         * Requiring whitespace or '>' right after the 'a' keeps tags such as
         * {@code <abbr>} or {@code <area>} from being treated as anchors.
         */
        private static final Pattern ANCHOR_TAG =
                Pattern.compile("<a(?:\\s[^>]*)?>", Pattern.CASE_INSENSITIVE);

        /**
         * Matches a quoted {@code href} attribute; group 2 is the value. The
         * leading whitespace requirement excludes attributes such as
         * {@code data-href}.
         */
        private static final Pattern HREF_ATTRIBUTE =
                Pattern.compile("\\shref\\s*=\\s*(['\"])(.*?)\\1",
                        Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

        /** Matches the {@code charset=...} parameter of a Content-Type header. */
        private static final Pattern CHARSET_PARAMETER =
                Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

        /**
         * Fetches {@link #PAGE_URL} and prints every anchor link it contains.
         *
         * @param args ignored
         * @throws IOException if the page cannot be fetched or read
         */
        public static void main(String[] args) throws IOException {
            URLConnection conn = new URL(PAGE_URL).openConnection();
            // The charset is a parameter of Content-Type; Content-Encoding only
            // describes compression (gzip, deflate, ...) and is not a charset.
            Charset charset = charsetFromContentType(conn.getContentType());

            String html;
            try (InputStream input = conn.getInputStream()) {
                html = new String(readFully(input), charset);
            }

            for (String link : extractLinks(html)) {
                System.out.println(link);
            }
        }

        /**
         * Extracts the {@code href} value of every anchor tag in the given HTML.
         *
         * @param html HTML text to scan; {@code null} is treated as empty
         * @return the link targets in document order, never {@code null}
         */
        static List<String> extractLinks(String html) {
            List<String> links = new ArrayList<String>();
            if (html == null || html.isEmpty()) {
                return links;
            }
            Matcher tagMatcher = ANCHOR_TAG.matcher(html);
            while (tagMatcher.find()) {
                Matcher hrefMatcher = HREF_ATTRIBUTE.matcher(tagMatcher.group());
                // Only the first href of a tag is reported.
                if (hrefMatcher.find()) {
                    links.add(hrefMatcher.group(2));
                }
            }
            return links;
        }

        /**
         * Determines the charset declared in a Content-Type header value.
         *
         * @param contentType header value such as {@code text/html; charset=utf-8};
         *                    may be {@code null}
         * @return the declared charset, or the fallback charset when none is
         *         declared or the declared name is not supported
         */
        static Charset charsetFromContentType(String contentType) {
            if (contentType != null) {
                Matcher m = CHARSET_PARAMETER.matcher(contentType);
                if (m.find()) {
                    try {
                        return Charset.forName(m.group(1));
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported charset name: use the fallback below.
                    }
                }
            }
            return Charset.forName(FALLBACK_CHARSET);
        }

        /**
         * Reads the stream until end-of-stream and returns everything read.
         *
         * @param input stream to drain; not closed by this method
         * @return all bytes read from the stream
         * @throws IOException if reading fails
         */
        private static byte[] readFully(InputStream input) throws IOException {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = input.read(buffer)) != -1) {
                // Copy only the bytes actually read; the rest of the buffer is stale.
                collected.write(buffer, 0, read);
            }
            return collected.toByteArray();
        }
    }

  • 相关阅读:
    一分钟 解决Tomcat端口 占用问题
    Java 自定义注解
    Java 解析自定义XML文件
    Junit(手动/自动)加载
    Java思维题
    SSM框架中使用日志框架
    DAC
    SPI接口的FLASH
    晶振测试起振方法
    Jlink不报错的方法
  • 原文地址:https://www.cnblogs.com/xinzhuangzi/p/4100397.html
Copyright © 2011-2022 走看看