zoukankan      html  css  js  c++  java
  • 正则表达式抓取文件内容中的http链接地址

    转自:https://www.cnblogs.com/akiradunn/p/5855073.html

      1 import java.io.BufferedReader;
      2 
      3 import java.io.FileInputStream;
      4 
      5 import java.io.FileNotFoundException;
      6 
      7 import java.io.FileOutputStream;
      8 
      9 import java.io.IOException;
     10 
     11 import java.io.InputStreamReader;
     12 
     13 import java.net.HttpURLConnection;
     14 
     15 import java.net.MalformedURLException;
     16 
     17 import java.net.URL;
     18 
     19 import java.util.regex.Matcher;
     20 
     21 import java.util.regex.Pattern;
     22 
     23 //正则表达式抓取网页数据
     24 
     /**
      * Extracts HTTP(S) link prefixes or e-mail addresses from text with regular
      * expressions and writes every match, one per line, to an output file.
      *
      * <p>Input is decoded and output is encoded as UTF-8. I/O failures are reported
      * on standard error via {@link Throwable#printStackTrace()} and are not
      * propagated to the caller.
      */
     public class HtmlAddressCatch {

         /** Charset used for every byte/character conversion in this class. */
         private static final java.nio.charset.Charset UTF_8 =
                 java.nio.charset.StandardCharsets.UTF_8;

         /** Matches {@code http://} or {@code https://} followed by three dot-separated word runs. */
         private static final Pattern HTTP_LINK = Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

         /** Matches a simple e-mail address of the form {@code word@word.word}. */
         private static final Pattern EMAIL = Pattern.compile("\\w+@\\w+\\.\\w+");

         /** Output file used by {@link #getWebTextContent(String)}. */
         private static final String DEFAULT_WEB_OUTPUT = "D:\\text.txt";

         /**
          * Demo entry point: downloads a fixed page and stores the links found in it.
          *
          * @param args ignored
          */
         public static void main(String[] args) {
             String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
             HtmlAddressCatch.getWebTextContent(webaddress);
             // To scan a local file for e-mail addresses instead:
             // HtmlAddressCatch.getLocalTextContent("D:\\test\\test.html", "D:\\test\\http.txt");
         }

         /**
          * Downloads the page at {@code webaddress} and writes every HTTP(S) link found
          * in it to the default output file.
          *
          * @param webaddress an {@code http} or {@code https} URL
          */
         public static void getWebTextContent(String webaddress) {
             getWebTextContent(webaddress, DEFAULT_WEB_OUTPUT);
         }

         /**
          * Downloads the page at {@code webaddress} and writes every HTTP(S) link found
          * in it to {@code targetaddress}, one link per line.
          *
          * <p>The output file is opened (and therefore truncated) before the response
          * is read, so it exists even when the download fails.
          *
          * @param webaddress    an {@code http} or {@code https} URL; the connection is
          *                      cast to {@link HttpURLConnection}
          * @param targetaddress path of the file that receives the matches
          */
         public static void getWebTextContent(String webaddress, String targetaddress) {
             HttpURLConnection con = null;
             try {
                 URL url = new URL(webaddress);
                 con = (HttpURLConnection) url.openConnection();
                 try (FileOutputStream file = new FileOutputStream(targetaddress);
                      BufferedReader reader = new BufferedReader(
                              new InputStreamReader(con.getInputStream(), UTF_8))) {
                     writeMatches(reader, file, HTTP_LINK);
                 }
             } catch (IOException e) {
                 // Also covers MalformedURLException and FileNotFoundException.
                 e.printStackTrace();
             } finally {
                 if (con != null) {
                     con.disconnect();
                 }
             }
         }

         /**
          * Reads the local file {@code localaddress} and writes every e-mail address
          * found in it to {@code targetaddress}, one address per line.
          *
          * @param localaddress  path of the file to scan
          * @param targetaddress path of the file that receives the matches
          */
         public static void getLocalTextContent(String localaddress, String targetaddress) {
             try (BufferedReader reader = new BufferedReader(
                          new InputStreamReader(new FileInputStream(localaddress), UTF_8));
                  FileOutputStream writer = new FileOutputStream(targetaddress)) {
                 writeMatches(reader, writer, EMAIL);
             } catch (IOException e) {
                 // Also covers FileNotFoundException.
                 e.printStackTrace();
             }
         }

         /**
          * Scans {@code reader} line by line and writes each match of {@code pattern}
          * followed by a newline to {@code out}.
          *
          * <p>Whole lines are matched so that no character is skipped and no match is
          * cut in half by a fixed-size read buffer; neither pattern can span a line
          * break.
          */
         private static void writeMatches(BufferedReader reader, FileOutputStream out, Pattern pattern)
                 throws IOException {
             String line;
             while ((line = reader.readLine()) != null) {
                 Matcher m = pattern.matcher(line);
                 while (m.find()) {
                     out.write((m.group() + "\n").getBytes(UTF_8));
                 }
             }
         }
     }
  • 相关阅读:
    平台升级至spring 4.3.0 运行稳定
    java过滤特殊字符的正则表达式
    xheditor-文件上传-java-支持html5-application/octet-stream
    java用正则方法验证文件名是否合法
    Java实现在线预览Word,Excel,Ppt文档
    为什么用freemarker视图?
    Java中判断String不为空的问题性能比较
    解决org.apache.velocity.exception.ResourceNotFoundException: Unable to find resource
    Java Swing 使用非本地字体
    第三方包jintellitype实现Java设置全局热键
  • 原文地址:https://www.cnblogs.com/sharpest/p/10390026.html
Copyright © 2011-2022 走看看