package com.imooc.regex; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStreamReader; import java.util.regex.Matcher; import java.util.regex.Pattern; public class RegexSample { public static void main(String[] args) { StringBuilder content = new StringBuilder(); try { FileInputStream fis = new FileInputStream("D:\eclipse-workspace\regex\WebContent\sample.html"); InputStreamReader isr = new InputStreamReader(fis,"UTF-8"); BufferedReader bufferedReader = new BufferedReader(isr); String lineText = ""; while((lineText=bufferedReader.readLine()) !=null ) { content.append(lineText + " "); } bufferedReader.close(); // System.out.println(content); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } //创建正则表达式对象 Pattern p = Pattern.compile("<li>([\u4e00-\u9fa5]{2,8})([a-zA-Z]+)</li>"); //匹配正则表达式 Matcher m = p.matcher(content); //查找匹配结构 while(m.find()) { System.out.println(m.group(0));//打印全部 System.out.println(m.group(1));//打印第一个分组 System.out.println(m.group(2));//打印第二个分组 } } }