zoukankan html css js c++ java

第十三章字符串正则匹配Pattern与Matcher

思考：为什么使用正则表达式之前都要经过编译？

思考：正则匹配又是怎么去匹配的？

从算法上分析，对一个字符串进行正则匹配，是拿正则表达式深度遍历整个字符串，也就是用正则表达式去匹配所有可能的子串【也不一定是所有，但肯定是绝大部分】，拿下面的这个例子来说

package 字符串;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates that {@code Matcher.matches()} only succeeds when the entire
 * input matches the pattern.
 *
 * <p>For comparison: {@code find()} can locate the pattern anywhere in the
 * input, whereas {@code lookingAt()} and {@code matches()} only succeed when
 * the match begins at the very start of the input; {@code lookingAt()} needs
 * just a matching prefix.
 *
 * @author admin
 */
public class TestMatcher {

    /**
     * Prints the length of a two-line sample text, then whether the whole
     * text matches one or more word characters (it does not: the text
     * contains spaces and punctuation).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Line breaks must be written as the "\n" escape: a string literal
        // cannot span several source lines.
        String s = "Twas brilling,and  the slithy toves\n" +
        "Did gyre and gimble in the wabe.\n";
        System.out.println(s.length());
        // The backslash of the regex has to be doubled inside a Java string
        // literal; a single backslash followed by 'w' is an illegal escape.
        Matcher m = Pattern.compile("\\w+").matcher(s);
        System.out.println(m.matches());
    }

}

代码中正则表达式为 \w+（在Java源码的字符串里需要写成"\\w+"），表示匹配一个或多个词字符[a-zA-Z_0-9]（字母、数字和下划线），如果让这个正则表达式去匹配字符串s，【用matches()方法】算法的一般思路就是：

遍历字符串s的所有字符，从首字母T开始和这个正则表达式匹配一次，如果匹配，继续拿Tw和这个正则表达式匹配一次，如果匹配，继续拿Twa和正则表达式匹配一次，就这样一直匹配下去。但是一旦出现某个子串与正则表达式不匹配，就结束程序返回false。上例中匹配到"Twas "时已经不再满足匹配要求，因为出现了空格。所以匹配过程会在第一次出现不满足正则表达式时就结束并返回false。上例程序输出结果（字符串长度之后）为：

false

推荐一篇分析正则表达式的博客（想深入了解的同学可以看看）：在这里

从上面分析我们就可以看出为什么使用正则表达式之前都要对正则表达式进行预编译了，因为正则匹配是不断的拿正则表达式去匹配比较字符子串，如果不预编译一次的话，每次匹配比较都要编译一次，显然是没必要的，如果你查看过compile()方法的源码会发现编译过程还是比较复杂和消耗性能的。

String类自带了一个matches方法，对正则需求不高时，直接使用字符串.matches("正则表达式")即可满足需求（注意：该方法要求整个字符串都匹配正则，并且每次调用都会重新编译一次正则表达式；需要反复匹配时应复用预编译好的Pattern）：

package 字符串;

/**
 * Shows the convenience method {@code String.matches(regex)}, which reports
 * whether the whole string matches the given regular expression.
 *
 * @author admin
 */
public class Regex {

    /**
     * Prints whether the sample sentence matches the pattern {@code ^Java.*}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String sentence = "Java now has regular expressions";
        final boolean startsWithJava = sentence.matches("^Java.*");
        System.out.println(startsWithJava);
    }

}

Matcher.find()方法：可以用来在CharSequence中依次查找多个匹配。

package 字符串;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds every word in a two-line sample text that does not start with an
 * upper-case letter, prints where each one was found, and finally prints the
 * number of distinct such words.
 *
 * <p>{@code find()} can locate the pattern anywhere in the input, whereas
 * {@code lookingAt()} and {@code matches()} only succeed when the match
 * begins at the very start of the input. {@code matches()} additionally
 * requires the whole input to match; {@code lookingAt()} needs just a
 * matching prefix.
 *
 * @author admin
 */
public class TestMatcher {

    /**
     * Runs the demonstration and writes its results to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Set<String> set = new HashSet<String>();
        // Line breaks must be written as the "\n" escape: a string literal
        // cannot span several source lines.
        String s = "Twas brilling,and  the slithy toves\n" +
        "Did gyre and gimble in the wabe.\n";
        System.out.println(s.length());
        // The backslash of the regex has to be doubled inside a Java string
        // literal; a single backslash followed by 'w' is an illegal escape.
        Matcher m = Pattern.compile("\\w+").matcher(s);
        System.out.println(m.matches());

        // Each find() call looks for the next subsequence matching the pattern.
        while (m.find()) {
            String word = m.group();
            if (!word.matches("[A-Z].*")) {
                // start()/end()/group() are only valid after a successful match.
                // regionStart()/regionEnd() report the bounds of the matcher's
                // region; the region is never changed here, and the program
                // prints 0 and s.length() for them.
                System.out.println(m.start()+"=="+m.end()+"=="+m.regionStart()+"=="+m.regionEnd()+"=="+word);
                // group() is the input subsequence from start() to end().
                set.add(word);
            }
        }
        System.out.println(set.size());
    }

}

控制台：

69
false
5==13==0==69==brilling
14==17==0==69==and
19==22==0==69==the
23==29==0==69==slithy
30==35==0==69==toves
40==44==0==69==gyre
45==48==0==69==and
49==55==0==69==gimble
56==58==0==69==in
59==62==0==69==the
63==67==0==69==wabe
9

每执行一次while循环都切出一块满足正则匹配的字符串。

Matcher.find(int i):其中i表示字符串中的位置，并以其作为搜索的起点。

package 字符串;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates {@code Matcher.find(int)}: the argument is the index in the
 * input at which the search for the next match starts.
 *
 * <p>{@code find()} can locate the pattern anywhere in the input, whereas
 * {@code lookingAt()} and {@code matches()} only succeed when the match
 * begins at the very start of the input.
 *
 * @author admin
 */
public class TestMatcher {

    /**
     * Searches the sample text from every start index 0, 1, 2, ... and prints
     * the match found from each one, stopping at the first index from which
     * no match can be found.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The line break must be written as the "\n" escape: a string literal
        // cannot span several source lines.
        String s = "Twas brilling,and  the slithy toves\n";
        System.out.println(s.length());
        // The backslash of the regex has to be doubled inside a Java string
        // literal; a single backslash followed by 'w' is an illegal escape.
        Matcher m = Pattern.compile("\\w+").matcher(s);

        // The loop ends when no word character is left at or after index i;
        // for this text that happens at the trailing line break, so i never
        // goes past the end of the input.
        for (int i = 0; m.find(i); i++) {
            System.out.println(m.start()+"=="+m.end()+"=="+m.regionStart()+"=="+m.regionEnd()+"=="+m.group());
        }
    }

}

控制台：

36
0==4==0==36==Twas
1==4==0==36==was
2==4==0==36==as
3==4==0==36==s
5==13==0==36==brilling
5==13==0==36==brilling
6==13==0==36==rilling
7==13==0==36==illing
8==13==0==36==lling
9==13==0==36==ling
10==13==0==36==ing
11==13==0==36==ng
12==13==0==36==g
14==17==0==36==and
14==17==0==36==and
15==17==0==36==nd
16==17==0==36==d
19==22==0==36==the
19==22==0==36==the
19==22==0==36==the
20==22==0==36==he
21==22==0==36==e
23==29==0==36==slithy
23==29==0==36==slithy
24==29==0==36==lithy
25==29==0==36==ithy
26==29==0==36==thy
27==29==0==36==hy
28==29==0==36==y
30==35==0==36==toves
30==35==0==36==toves
31==35==0==36==oves
32==35==0==36==ves
33==35==0==36==es
34==35==0==36==s

搜索的起点不断发生改变，每次从搜索起点位置开始去切出（group）那些满足正则条件的子串。

Matcher.lookingAt()：只要输入的开头部分匹配就会成功【匹配必须从输入的起始位置开始，但不要求一直匹配到输入末尾；只要开头处能匹配出一个满足正则的子串就返回true】：

package 字符串;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates {@code Matcher.lookingAt()}, which succeeds as soon as a
 * prefix of the input matches the pattern.
 *
 * <p>{@code find()} can locate the pattern anywhere in the input, whereas
 * {@code lookingAt()} and {@code matches()} only succeed when the match
 * begins at the very start of the input. {@code matches()} additionally
 * requires the whole input to match; {@code lookingAt()} needs just a
 * matching prefix.
 *
 * @author admin
 */
public class TestMatcher {

    /**
     * Applies {@code lookingAt()} to three sample inputs and prints one
     * result line for each.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The line breaks must be written as the "\n" escape: a string
        // literal cannot span several source lines.
        String s1 = "Twas brilling,and  the slithy toves\n";
        String s2 = " Twas brilling,and  the slithy toves\n";
        String s3 = "T;was brilling,and  the slithy toves\n";
        // The backslash of the regex has to be doubled inside a Java string
        // literal; a single backslash followed by 'w' is an illegal escape.
        Pattern pattern = Pattern.compile("\\w+");

        System.out.println(describeLookingAt(pattern.matcher(s1)));
        System.out.println(describeLookingAt(pattern.matcher(s2)));
        System.out.println(describeLookingAt(pattern.matcher(s3)));
    }

    /**
     * Runs {@code lookingAt()} on the matcher and formats the outcome.
     *
     * @param m the matcher to test
     * @return {@code "false"} when no prefix matches, otherwise
     *         {@code "true=<start>=<end>=<group>"}
     */
    private static String describeLookingAt(Matcher m) {
        if (!m.lookingAt()) {
            // start()/end()/group() throw IllegalStateException after a
            // failed match, so they must not be called here.
            return "false";
        }
        return "true=" + m.start() + "=" + m.end() + "=" + m.group();
    }

}

控制台：

true=0=4=Twas
false
true=0=1=T

匹配必须从输入的开头开始，只要开头处能匹配出一个满足正则的子串就返回true：s1匹配到的子串为Twas；s2的第一个字符（空格）就不满足正则，直接返回false；s3匹配到的子串为T。

前进时，请别遗忘了身后的脚印。

查看全文

相关阅读:
编写安全有效的 C# 代码
 模式匹配
 C#新特性
 转 C# .NET4.0 混合模式程序集异常
 win7 64位系统注册 ocx控件
 TIFF图像文件格式详解
 GDALOGR读取数据示例 C#版本
 使用gdal C#封装库读取DEM数据
 编译C#环境下GDAL（支持HDF4、NetCDF）
UML类图符号各种关系说明以及举例

原文地址：https://www.cnblogs.com/liudaihuablogs/p/9296465.html

热门文章
Pipelines
集合
 内存和跨度相关类型
 数据流
 Task
LINQ
弱引用
 清理未托管资源
 正则表达式
 数值格式化

第十三章 字符串 正则匹配Pattern与Matcher

第十三章字符串正则匹配Pattern与Matcher