zoukankan      html  css  js  c++  java
  • 正则表达式_网页爬虫

    其实就一个程序用于在互联网中获取符合指定规则的数据。 

    爬取邮箱地址

     爬取本地中的文件:

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    public class RegexTest {
    
        public static void main(String[] args) throws IOException {
    
            
            List<String> list = getMails();
            
            for(String mail : list){
                System.out.println(mail);
            }
        }
        
        
    
        public static List<String>  getMails() throws IOException{
            
            //1,读取源文件。
            BufferedReader bufr = new BufferedReader(new FileReader("F:\IO\mail.html"));
            
            //2,对读取的数据进行规则的匹配。从中获取符合规则的数据.
            String mail_regex = "\w+@\w+(\.\w+)+";
            
            List<String> list = new ArrayList<String>();
            
            
            Pattern p = Pattern.compile(mail_regex);
            
            String line = null;
            
            while((line=bufr.readLine())!=null){
                
                Matcher m = p.matcher(line);
                while(m.find()){
                    //3,将符合规则的数据存储到集合中。
                    list.add(m.group());
                }
                
            }
            return list;
            
        }
    }

     爬取网络中的:

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    public class RegexTest {
    
        public static void main(String[] args) throws IOException {
    
            
            List<String> list = getMailsByWeb();
            
            for(String mail : list){
                System.out.println(mail);
            }
        }
        public static List<String> getMailsByWeb() throws IOException {
            
            URL url = new URL("http://192.168.1.100:8080/myweb/mail.html");
            
            BufferedReader bufIn = new BufferedReader(new InputStreamReader(url.openStream()));
                    
            //2,对读取的数据进行规则的匹配。从中获取符合规则的数据.
            String mail_regex = "\w+@\w+(\.\w+)+";
            
            List<String> list = new ArrayList<String>();
            
            
            Pattern p = Pattern.compile(mail_regex);
            
            String line = null;
            
            while((line=bufIn.readLine())!=null){
                
                Matcher m = p.matcher(line);
                while(m.find()){
                    //3,将符合规则的数据存储到集合中。
                    list.add(m.group());
                }
                
            }
            return list;
        }
    }
  • 相关阅读:
    51nod乘积之和
    Dell服务器安装OpenManage(OMSA)
    Nginx反向代理PHP
    搭建haproxy
    108. Convert Sorted Array to Binary Search Tree
    60. Permutation Sequence
    142. Linked List Cycle II
    129. Sum Root to Leaf Numbers
    118. Pascal's Triangle
    26. Remove Duplicates from Sorted Array
  • 原文地址:https://www.cnblogs.com/LO-ME/p/3603569.html
Copyright © 2011-2022 走看看