zoukankan      html  css  js  c++  java
  • 一些中文相关的操作方法

    package com.opslab.util;


    import net.sourceforge.pinyin4j.PinyinHelper;
    import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
    import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
    import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
    import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
    import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /**
    * 一些中文相关的操作方法
    */
    public final class ChinesUtil {
    private ChinesUtil(){

    }
    /**
    * 将字符串中的中文转化为拼音,其他字符不变
    *
    * @param inputString
    * @return
    */
    public final static String getPingYin(String inputString) {
    HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
    format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
    format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    format.setVCharType(HanyuPinyinVCharType.WITH_V);

    char[] input = inputString.trim().toCharArray();
    String output = "";

    try {
    for (int i = 0; i < input.length; i++) {
    if (java.lang.Character.toString(input[i]).matches("[\u4E00-\u9FA5]+")) {
    String[] temp = PinyinHelper.toHanyuPinyinStringArray(input[i], format);
    output += temp[0];
    } else
    output += java.lang.Character.toString(input[i]);
    }
    } catch (BadHanyuPinyinOutputFormatCombination e) {
    e.printStackTrace();
    }
    return output;
    }

    /**
    * 获取汉字串拼音首字母,英文字符不变
    *
    * @param chinese 汉字串
    * @return 汉语拼音首字母
    */
    public final static String getFirstSpell(String chinese) {
    StringBuffer pybf = new StringBuffer();
    char[] arr = chinese.toCharArray();
    HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
    defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
    defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    for (int i = 0; i < arr.length; i++) {
    if (arr[i] > 128) {
    try {
    String[] temp = PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat);
    if (temp != null) {
    pybf.append(temp[0].charAt(0));
    }
    } catch (BadHanyuPinyinOutputFormatCombination e) {
    e.printStackTrace();
    }
    } else {
    pybf.append(arr[i]);
    }
    }
    return pybf.toString().replaceAll("\W", "").trim();
    }

    /**
    * 获取汉字串拼音,英文字符不变
    *
    * @param chinese 汉字串
    * @return 汉语拼音
    */
    public final static String getFullSpell(String chinese) {
    StringBuffer pybf = new StringBuffer();
    char[] arr = chinese.toCharArray();
    HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
    defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
    defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    for (int i = 0; i < arr.length; i++) {
    if (arr[i] > 128) {
    try {
    pybf.append(PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat)[0]);
    } catch (BadHanyuPinyinOutputFormatCombination e) {
    e.printStackTrace();
    }
    } else {
    pybf.append(arr[i]);
    }
    }
    return pybf.toString();
    }


    // 只能判断部分CJK字符(CJK统一汉字)
    public final static boolean isChineseByREG(String str) {
    if (str == null) {
    return false;
    }
    Pattern pattern = Pattern.compile("[\u4E00-\u9FBF]+");
    return pattern.matcher(str.trim()).find();
    }

    // 只能判断部分CJK字符(CJK统一汉字)
    public final static boolean isChineseByName(String str) {
    if (str == null) {
    return false;
    }
    // 大小写不同:\p 表示包含,\P 表示不包含
    // \p{Cn} 的意思为 Unicode 中未被定义字符的编码,\P{Cn} 就表示 Unicode中已经被定义字符的编码
    String reg = "\p{InCJK Unified Ideographs}&&\P{Cn}";
    Pattern pattern = Pattern.compile(reg);
    return pattern.matcher(str.trim()).find();
    }


    // 完整的判断中文汉字和符号
    public final static boolean isChinese(String strName) {
    char[] ch = strName.toCharArray();
    for (int i = 0; i < ch.length; i++) {
    char c = ch[i];
    if (isChinese(c)) {
    return true;
    }
    }
    return false;
    }

    /**
    * 判断是否是中文
    *
    * @param c
    * @return
    */
    public final static boolean isChinese(char c) {
    Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
    if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
    return true;
    }
    return false;
    }

    /**
    * 获取一个字符串中中文字符的个数
    */
    public final static int ChineseLength(String str) {
    Pattern p = Pattern.compile("[u4E00-u9FA5]+");
    Matcher m = p.matcher(str);
    int i = 0;
    while (m.find()) {
    String temp = m.group(0);
    i += temp.length();
    }
    return i;
    }

    /**
    * 判断是否是乱码
    *
    * @param strName
    * @return
    */
    public final static boolean isMessyCode(String strName) {
    Pattern p = Pattern.compile("\s*| *| *| *");
    Matcher m = p.matcher(strName);
    String after = m.replaceAll("");
    String temp = after.replaceAll("\p{P}", "");
    char[] ch = temp.trim().toCharArray();
    float chLength = 0;
    float count = 0;
    for (int i = 0; i < ch.length; i++) {
    char c = ch[i];
    if (!Character.isLetterOrDigit(c)) {
    if (!ChinesUtil.isChinese(c)) {
    count = count + 1;
    }
    chLength++;
    }
    }
    float result = count / chLength;
    if (result > 0.4) {
    return true;
    } else {
    return false;
    }
    }
    }

  • 相关阅读:
    初学mongodb笔记
    git学习【转载】
    json序列化与反序列化
    ES6函数比对ES5函数
    前端js脚本与防止js脚本
    js函数整合队列顺序执行插件
    padding-使用必记
    css小技巧
    三分钟教会你开密码箱
    百度地图Marker优化方案
  • 原文地址:https://www.cnblogs.com/chinaifae/p/10254805.html
Copyright © 2011-2022 走看看