zoukankan      html  css  js  c++  java
  • Java中Web页面信息获取

    package com.imooc.regex;
    
    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Reads an HTML file as UTF-8 text and prints every {@code <li>} element whose
     * content is a short run of CJK characters immediately followed by ASCII letters.
     *
     * <p>For each match three lines are written to standard output: the whole
     * matched element, the CJK part (group 1) and the ASCII-letter part (group 2).
     */
    public class RegexSample {

        /** File that is read when no path is given on the command line. */
        private static final String DEFAULT_HTML_PATH =
                "D:\\eclipse-workspace\\regex\\WebContent\\sample.html";

        /**
         * Matches {@code <li>XY</li>} where X is 2 to 8 characters in the range
         * U+4E00..U+9FA5 and Y is one or more ASCII letters. Compiled once because
         * a {@link Pattern} is immutable and reusable.
         */
        private static final Pattern LIST_ITEM =
                Pattern.compile("<li>([\u4e00-\u9fa5]{2,8})([a-zA-Z]+)</li>");

        /**
         * Program entry point.
         *
         * @param args optional; when present, {@code args[0]} is the path of the HTML
         *             file to scan, otherwise {@link #DEFAULT_HTML_PATH} is used
         */
        public static void main(String[] args) {
            String path = (args != null && args.length > 0) ? args[0] : DEFAULT_HTML_PATH;

            String content;
            try {
                content = readAll(path);
            } catch (IOException e) {
                // Nothing can be matched without the file; report the failure and stop.
                System.err.println("Unable to read " + path + ": " + e);
                return;
            }

            // Scan the text and print each match together with its two capture groups.
            Matcher m = LIST_ITEM.matcher(content);
            while (m.find()) {
                System.out.println(m.group(0)); // the whole matched <li> element
                System.out.println(m.group(1)); // first group: the CJK characters
                System.out.println(m.group(2)); // second group: the ASCII letters
            }
        }

        /**
         * Reads the whole file as UTF-8 text, terminating every line with {@code '\n'}.
         *
         * @param path path of the file to read
         * @return the file content with normalized line endings
         * @throws IOException if the file cannot be opened or read
         */
        private static String readAll(String path) throws IOException {
            StringBuilder content = new StringBuilder();
            // try-with-resources closes the reader (and the wrapped streams) even when
            // readLine() throws.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append('\n');
                }
            }
            return content.toString();
        }

    }
  • 相关阅读:
    面向对象编程
    面向对象编程进阶
    pycharm常用快捷键
    面向对象
    深拷贝和浅拷贝
    hashlib模块
    日志配置
    常用模块大全
    正则详解
    软件目录规范
  • 原文地址:https://www.cnblogs.com/wuheng-123/p/13715123.html
Copyright © 2011-2022 走看看