zoukankan      html  css  js  c++  java
  • 正则表达式提取URL

    package com.url;

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URL;
    import java.net.URLConnection;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /**
     * Downloads an HTML page and prints the {@code href} target of every anchor tag it contains.
     *
     * <p>The extraction is regex based, not a real HTML parser: only quoted {@code href} values
     * are recognised, and an anchor tag whose attribute values contain a literal {@code >} is
     * cut short at that character.
     */
    public class Urlconnection {

        /** Page fetched when no URL is passed on the command line. */
        private static final String DEFAULT_URL = "http://www.baidu.com/";

        /** Charset assumed when the response does not declare a usable one. */
        private static final String DEFAULT_CHARSET = "GBK";

        /**
         * Opening (or self-closing) anchor tag. The look-ahead keeps tags that merely start with
         * the letter "a" ({@code <abbr>}, {@code <area>}, ...) from matching, and {@code [^>]*}
         * confines the match to one tag instead of running on to a later one.
         */
        private static final Pattern ANCHOR_TAG =
                Pattern.compile("<\\s*a(?=[\\s/>])[^>]*>", Pattern.CASE_INSENSITIVE);

        /**
         * Quoted {@code href} attribute; group 2 is the value. The look-behind rejects attributes
         * that only end in "href", such as {@code data-href}.
         */
        private static final Pattern HREF_ATTRIBUTE = Pattern.compile(
                "(?<![\\w-])href\\s*=\\s*(['\"])(.*?)\\1",
                Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

        /** {@code charset=...} parameter of a Content-Type header value; group 1 is the name. */
        private static final Pattern CHARSET_PARAMETER = Pattern.compile(
                "charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

        /**
         * Fetches a page and prints one anchor target per line to standard output.
         *
         * @param args optional; {@code args[0]} is the URL to fetch, otherwise the default page is used
         * @throws IOException if the URL is malformed or the page cannot be read
         */
        public static void main(String[] args) throws IOException {
            String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
            URLConnection conn = new URL(address).openConnection();

            byte[] body;
            Charset charset;
            try (InputStream input = conn.getInputStream()) {
                // The charset is a parameter of Content-Type; Content-Encoding names a
                // compression scheme (gzip, ...) and must not be used to decode text.
                charset = resolveCharset(conn.getContentType());
                body = readFully(input);
            }

            // Decode once, after the whole body has arrived, so neither multi-byte characters
            // nor tags can be torn apart at a read-buffer boundary.
            for (String link : extractLinks(new String(body, charset))) {
                System.out.println(link);
            }
        }

        /**
         * Returns the {@code href} value of every anchor tag in the given markup, in document order.
         *
         * @param html markup to scan; {@code null} is treated as empty
         * @return the quoted {@code href} values found inside {@code <a ...>} tags, never {@code null}
         */
        public static List<String> extractLinks(String html) {
            List<String> links = new ArrayList<String>();
            if (html == null || html.isEmpty()) {
                return links;
            }
            Matcher tagMatcher = ANCHOR_TAG.matcher(html);
            while (tagMatcher.find()) {
                Matcher hrefMatcher = HREF_ATTRIBUTE.matcher(tagMatcher.group());
                // Only the first href counts; a repeated attribute on one tag is invalid HTML.
                if (hrefMatcher.find()) {
                    links.add(hrefMatcher.group(2));
                }
            }
            return links;
        }

        /** Reads the stream to its end, keeping only the bytes each read actually delivered. */
        private static byte[] readFully(InputStream input) throws IOException {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buffer = new byte[4096];
            int read;
            while ((read = input.read(buffer)) != -1) {
                collected.write(buffer, 0, read);
            }
            return collected.toByteArray();
        }

        /** Picks the charset named in a Content-Type value, or the default when none is usable. */
        private static Charset resolveCharset(String contentType) {
            if (contentType != null) {
                Matcher charsetMatcher = CHARSET_PARAMETER.matcher(contentType);
                if (charsetMatcher.find()) {
                    try {
                        return Charset.forName(charsetMatcher.group(1));
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported name sent by the server: use the default below.
                    }
                }
            }
            try {
                return Charset.forName(DEFAULT_CHARSET);
            } catch (IllegalArgumentException ignored) {
                // The default is an optional charset and may be missing from a trimmed-down
                // runtime; UTF-8 is guaranteed to exist on every Java platform.
                return Charset.forName("UTF-8");
            }
        }
    }

  • 相关阅读:
    修改Oracle数据库的字符集为UTF-8
    yum源的更新问题
    Spark的编译
    hadoop版本和位数的查看方法
    hadoop2.x通过Zookeeper来实现namenode的HA方案以及ResourceManager单点故障的解决方案
    zookeeper3.4.6的安装
    SQL SERVER回滚恢复误操作的数据
    SQLServer异常捕获
    111111
    sql分割函数
  • 原文地址:https://www.cnblogs.com/xinzhuangzi/p/4100397.html
Copyright © 2011-2022 走看看