zoukankan      html  css  js  c++  java
  • 常用工具类

      1 import java.io.*;
      2 import java.util.Collection;
      3 import java.util.Iterator;
      4 import java.util.List;
      5 
      6 /**
      7  * 文本工具类
      8  */
      9 public class TextUtility
     10 {
     11 
     12     /**
     13      * 单字节
     14      */
     15     public static final int CT_SINGLE = 5;// SINGLE byte
     16 
     17     /**
     18      * 分隔符"!,.?()[]{}+=
     19      */
     20     public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter
     21 
     22     /**
     23      * 中文字符
     24      */
     25     public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese Char
     26 
     27     /**
     28      * 字母
     29      */
     30     public static final int CT_LETTER = CT_SINGLE + 3;// HanYu Pinyin
     31 
     32     /**
     33      * 数字
     34      */
     35     public static final int CT_NUM = CT_SINGLE + 4;// HanYu Pinyin
     36 
     37     /**
     38      * 序号
     39      */
     40     public static final int CT_INDEX = CT_SINGLE + 5;// HanYu Pinyin
     41 
     42     /**
     43      * 中文数字
     44      */
     45     public static final int CT_CNUM = CT_SINGLE + 6;
     46 
     47     /**
     48      * 其他
     49      */
     50     public static final int CT_OTHER = CT_SINGLE + 12;// Other
     51 
     52     public static int charType(char c)
     53     {
     54         return charType(String.valueOf(c));
     55     }
     56 
     57     /**
     58      * 判断字符类型
     59      * @param str
     60      * @return
     61      */
     62     public static int charType(String str)
     63     {
     64         if (str != null && str.length() > 0)
     65         {
     66             if ("零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟".contains(str)) return CT_CNUM;
     67             byte[] b;
     68             try
     69             {
     70                 b = str.getBytes("GBK");
     71             }
     72             catch (UnsupportedEncodingException e)
     73             {
     74                 b = str.getBytes();
     75                 e.printStackTrace();
     76             }
     77             byte b1 = b[0];
     78             byte b2 = b.length > 1 ? b[1] : 0;
     79             int ub1 = getUnsigned(b1);
     80             int ub2 = getUnsigned(b2);
     81             if (ub1 < 128)
     82             {
     83                 if (ub1 < 32) return CT_DELIMITER; // NON PRINTABLE CHARACTERS
     84                 if (' ' == b1) return CT_OTHER;
     85                 if ('
    ' == b1) return CT_DELIMITER;
     86                 if ("*"!,.?()[]{}+=/\;:|".indexOf((char) b1) != -1)
     87                     return CT_DELIMITER;
     88                 if ("0123456789".indexOf((char)b1) != -1)
     89                     return CT_NUM;
     90                 return CT_SINGLE;
     91             }
     92             else if (ub1 == 162)
     93                 return CT_INDEX;
     94             else if (ub1 == 163 && ub2 > 175 && ub2 < 186)
     95                 return CT_NUM;
     96             else if (ub1 == 163
     97                     && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225
     98                     && ub2 <= 250))
     99                 return CT_LETTER;
    100             else if (ub1 == 161 || ub1 == 163)
    101                 return CT_DELIMITER;
    102             else if (ub1 >= 176 && ub1 <= 247)
    103                 return CT_CHINESE;
    104 
    105         }
    106         return CT_OTHER;
    107     }
    108 
    109     /**
    110      * 是否全是中文
    111      * @param str
    112      * @return
    113      */
    114     public static boolean isAllChinese(String str)
    115     {
    116         return str.matches("[\u4E00-\u9FA5]+");
    117     }
    118     /**
    119      * 是否全部不是中文
    120      * @param sString
    121      * @return
    122      */
    123     public static boolean isAllNonChinese(byte[] sString)
    124     {
    125         int nLen = sString.length;
    126         int i = 0;
    127 
    128         while (i < nLen)
    129         {
    130             if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175)
    131                 return false;
    132             if (sString[i] < 0)
    133                 i += 2;
    134             else
    135                 i += 1;
    136         }
    137         return true;
    138     }
    139 
    140     /**
    141      * 是否全是单字节
    142      * @param str
    143      * @return
    144      */
    145     public static boolean isAllSingleByte(String str)
    146     {
    147         assert str != null;
    148         for (int i = 0; i < str.length(); i++)
    149         {
    150             if (str.charAt(i) >128)
    151             {
    152                 return false;
    153             }
    154         }
    155         return true;
    156     }
    157 
    158     /**
    159      * 把表示数字含义的字符串转成整形
    160      *
    161      * @param str 要转换的字符串
    162      * @return 如果是有意义的整数,则返回此整数值。否则,返回-1。
    163      */
    164     public static int cint(String str)
    165     {
    166         if (str != null)
    167             try
    168             {
    169                 int i = new Integer(str).intValue();
    170                 return i;
    171             }
    172             catch (NumberFormatException e)
    173             {
    174 
    175             }
    176 
    177         return -1;
    178     }
    179     /**
    180      * 是否全是数字
    181      * @param str
    182      * @return
    183      */
    184     public static boolean isAllNum(String str)
    185     {
    186         if (str == null)
    187             return false;
    188 
    189         int i = 0;
    190         /** 判断开头是否是+-之类的符号 */
    191         if ("±+-+-—".indexOf(str.charAt(0)) != -1)
    192             i++;
    193         /** 如果是全角的0123456789 字符* */
    194         while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
    195             i++;
    196         // Get middle delimiter such as .
    197         if (i > 0 && i < str.length())
    198         {
    199             char ch = str.charAt(i);
    200             if ("·∶:,,..//".indexOf(ch) != -1)
    201             {// 98.1%
    202                 i++;
    203                 while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
    204                     i++;
    205             }
    206         }
    207         if (i >= str.length())
    208             return true;
    209 
    210         /** 如果是半角的0123456789字符* */
    211         while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
    212             i++;
    213         // Get middle delimiter such as .
    214         if (i > 0 && i < str.length())
    215         {
    216             char ch = str.charAt(i);
    217             if (',' == ch || '.' == ch || '/' == ch  || ':' == ch || "∶·,./".indexOf(ch) != -1)
    218             {// 98.1%
    219                 i++;
    220                 while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
    221                     i++;
    222             }
    223         }
    224 
    225         if (i < str.length())
    226         {
    227             if ("百千万亿佰仟%%‰".indexOf(str.charAt(i)) != -1)
    228                 i++;
    229         }
    230         if (i >= str.length())
    231             return true;
    232 
    233         return false;
    234     }
    235 
    236     /**
    237      * 是否全是序号
    238      * @param sString
    239      * @return
    240      */
    241     public static boolean isAllIndex(byte[] sString)
    242     {
    243         int nLen = sString.length;
    244         int i = 0;
    245 
    246         while (i < nLen - 1 && getUnsigned(sString[i]) == 162)
    247         {
    248             i += 2;
    249         }
    250         if (i >= nLen)
    251             return true;
    252         while (i < nLen && (sString[i] > 'A' - 1 && sString[i] < 'Z' + 1)
    253                 || (sString[i] > 'a' - 1 && sString[i] < 'z' + 1))
    254         {// single
    255             // byte
    256             // number
    257             // char
    258             i += 1;
    259         }
    260 
    261         if (i < nLen)
    262             return false;
    263         return true;
    264 
    265     }
    266 
    267     /**
    268      * 是否全为英文
    269      *
    270      * @param text
    271      * @return
    272      */
    273     public static boolean isAllLetter(String text)
    274     {
    275         for (int i = 0; i < text.length(); ++i)
    276         {
    277             char c = text.charAt(i);
    278             if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z'))))
    279             {
    280                 return false;
    281             }
    282         }
    283 
    284         return true;
    285     }
    286 
    287     /**
    288      * 是否全为英文或字母
    289      *
    290      * @param text
    291      * @return
    292      */
    293     public static boolean isAllLetterOrNum(String text)
    294     {
    295         for (int i = 0; i < text.length(); ++i)
    296         {
    297             char c = text.charAt(i);
    298             if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')) && ((c < '0' || c > '9'))))
    299             {
    300                 return false;
    301             }
    302         }
    303 
    304         return true;
    305     }
    306 
    307     /**
    308      * 是否全是分隔符
    309      * @param sString
    310      * @return
    311      */
    312     public static boolean isAllDelimiter(byte[] sString)
    313     {
    314         int nLen = sString.length;
    315         int i = 0;
    316 
    317         while (i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163))
    318         {
    319             i += 2;
    320         }
    321         if (i < nLen)
    322             return false;
    323         return true;
    324     }
    325 
    326     /**
    327      * 是否全是中国数字
    328      * @param word
    329      * @return
    330      */
    331     public static boolean isAllChineseNum(String word)
    332     {// 百分之五点六的人早上八点十八分起床
    333 
    334         String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";//
    335         String prefix = "几数上第";
    336         String surfix = "几多余来成倍";
    337         boolean round = false;
    338 
    339         if (word == null)
    340             return false;
    341 
    342         char[] temp = word.toCharArray();
    343         for (int i = 0; i < temp.length; i++)
    344         {
    345             if (word.startsWith("分之", i))// 百分之五
    346             {
    347                 i += 1;
    348                 continue;
    349             }
    350             char tchar = temp[i];
    351             if (i == 0 && prefix.indexOf(tchar) != -1)
    352             {
    353                 round = true;
    354             }
    355             else if (i == temp.length-1 && !round && surfix.indexOf(tchar) != -1)
    356             {
    357                 round = true;
    358             }
    359             else if (chineseNum.indexOf(tchar) == -1)
    360                 return false;
    361         }
    362         return true;
    363     }
    364 
    365 
    366     /**
    367      * 得到字符集的字符在字符串中出现的次数
    368      *
    369      * @param charSet
    370      * @param word
    371      * @return
    372      */
    373     public static int getCharCount(String charSet, String word)
    374     {
    375         int nCount = 0;
    376 
    377         if (word != null)
    378         {
    379             String temp = word + " ";
    380             for (int i = 0; i < word.length(); i++)
    381             {
    382                 String s = temp.substring(i, i + 1);
    383                 if (charSet.indexOf(s) != -1)
    384                     nCount++;
    385             }
    386         }
    387 
    388         return nCount;
    389     }
    390 
    391 
    392     /**
    393      * 获取字节对应的无符号整型数
    394      *
    395      * @param b
    396      * @return
    397      */
    398     public static int getUnsigned(byte b)
    399     {
    400         if (b > 0)
    401             return (int) b;
    402         else
    403             return (b & 0x7F + 128);
    404     }
    405 
    406     /**
    407      * 判断字符串是否是年份
    408      *
    409      * @param snum
    410      * @return
    411      */
    412     public static boolean isYearTime(String snum)
    413     {
    414         if (snum != null)
    415         {
    416             int len = snum.length();
    417             String first = snum.substring(0, 1);
    418 
    419             // 1992年, 98年,06年
    420             if (isAllSingleByte(snum)
    421                     && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0)))
    422                 return true;
    423             if (isAllNum(snum) && (len >= 3 || len == 2 && "056789".indexOf(first) != -1))
    424                 return true;
    425             if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2)
    426                 return true;
    427             if (len == 4 && getCharCount("千仟零○", snum) == 2)// 二仟零二年
    428                 return true;
    429             if (len == 1 && getCharCount("千仟", snum) == 1)
    430                 return true;
    431             if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1
    432                     && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1)
    433                 return true;
    434         }
    435         return false;
    436     }
    437 
    438     /**
    439      * 判断一个字符串的所有字符是否在另一个字符串集合中
    440      *
    441      * @param aggr 字符串集合
    442      * @param str  需要判断的字符串
    443      * @return
    444      */
    445     public static boolean isInAggregate(String aggr, String str)
    446     {
    447         if (aggr != null && str != null)
    448         {
    449             str += "1";
    450             for (int i = 0; i < str.length(); i++)
    451             {
    452                 String s = str.substring(i, i + 1);
    453                 if (aggr.indexOf(s) == -1)
    454                     return false;
    455             }
    456             return true;
    457         }
    458 
    459         return false;
    460     }
    461 
    462     /**
    463      * 判断该字符串是否是半角字符
    464      *
    465      * @param str
    466      * @return
    467      */
    468     public static boolean isDBCCase(String str)
    469     {
    470         if (str != null)
    471         {
    472             str += " ";
    473             for (int i = 0; i < str.length(); i++)
    474             {
    475                 String s = str.substring(i, i + 1);
    476                 int length = 0;
    477                 try
    478                 {
    479                     length = s.getBytes("GBK").length;
    480                 }
    481                 catch (UnsupportedEncodingException e)
    482                 {
    483                     e.printStackTrace();
    484                     length = s.getBytes().length;
    485                 }
    486                 if (length != 1)
    487                     return false;
    488             }
    489 
    490             return true;
    491         }
    492 
    493         return false;
    494     }
    495 
    496     /**
    497      * 判断该字符串是否是全角字符
    498      *
    499      * @param str
    500      * @return
    501      */
    502     public static boolean isSBCCase(String str)
    503     {
    504         if (str != null)
    505         {
    506             str += " ";
    507             for (int i = 0; i < str.length(); i++)
    508             {
    509                 String s = str.substring(i, i + 1);
    510                 int length = 0;
    511                 try
    512                 {
    513                     length = s.getBytes("GBK").length;
    514                 }
    515                 catch (UnsupportedEncodingException e)
    516                 {
    517                     e.printStackTrace();
    518                     length = s.getBytes().length;
    519                 }
    520                 if (length != 2)
    521                     return false;
    522             }
    523 
    524             return true;
    525         }
    526 
    527         return false;
    528     }
    529 
    530     /**
    531      * 判断是否是一个连字符(分隔符)
    532      *
    533      * @param str
    534      * @return
    535      */
    536     public static boolean isDelimiter(String str)
    537     {
    538         if (str != null && ("-".equals(str) || "-".equals(str)))
    539             return true;
    540         else
    541             return false;
    542     }
    543 
    544     public static boolean isUnknownWord(String word)
    545     {
    546         if (word != null && word.indexOf("未##") == 0)
    547             return true;
    548         else
    549             return false;
    550     }
    551 
    552     /**
    553      * 防止频率为0发生除零错误
    554      *
    555      * @param frequency
    556      * @return
    557      */
    558     public static double nonZero(double frequency)
    559     {
    560         if (frequency == 0) return 1e-3;
    561 
    562         return frequency;
    563     }
    564 
    565     /**
    566      * 转换long型为char数组
    567      *
    568      * @param x
    569      */
    570     public static char[] long2char(long x)
    571     {
    572         char[] c = new char[4];
    573         c[0] = (char) (x >> 48);
    574         c[1] = (char) (x >> 32);
    575         c[2] = (char) (x >> 16);
    576         c[3] = (char) (x);
    577         return c;
    578     }
    579 
    580     /**
    581      * 转换long类型为string
    582      *
    583      * @param x
    584      * @return
    585      */
    586     public static String long2String(long x)
    587     {
    588         char[] cArray = long2char(x);
    589         StringBuilder sbResult = new StringBuilder(cArray.length);
    590         for (char c : cArray)
    591         {
    592             sbResult.append(c);
    593         }
    594         return sbResult.toString();
    595     }
    596 
    597     /**
    598      * 将异常转为字符串
    599      *
    600      * @param e
    601      * @return
    602      */
    603     public static String exceptionToString(Exception e)
    604     {
    605         StringWriter sw = new StringWriter();
    606         PrintWriter pw = new PrintWriter(sw);
    607         e.printStackTrace(pw);
    608         return sw.toString();
    609     }
    610 
    611     /**
    612      * 判断某个字符是否为汉字
    613      *
    614      * @param c 需要判断的字符
    615      * @return 是汉字返回true,否则返回false
    616      */
    617     public static boolean isChinese(char c)
    618     {
    619         String regex = "[\u4e00-\u9fa5]";
    620         return String.valueOf(c).matches(regex);
    621     }
    622 
    623     /**
    624      * 统计 keyword 在 srcText 中的出现次数
    625      *
    626      * @param keyword
    627      * @param srcText
    628      * @return
    629      */
    630     public static int count(String keyword, String srcText)
    631     {
    632         int count = 0;
    633         int leng = srcText.length();
    634         int j = 0;
    635         for (int i = 0; i < leng; i++)
    636         {
    637             if (srcText.charAt(i) == keyword.charAt(j))
    638             {
    639                 j++;
    640                 if (j == keyword.length())
    641                 {
    642                     count++;
    643                     j = 0;
    644                 }
    645             }
    646             else
    647             {
    648                 i = i - j;// should rollback when not match
    649                 j = 0;
    650             }
    651         }
    652 
    653         return count;
    654     }
    655 
    656     /**
    657      * 简单好用的写String方式
    658      *
    659      * @param s
    660      * @param out
    661      * @throws IOException
    662      */
    663     public static void writeString(String s, DataOutputStream out) throws IOException
    664     {
    665         out.writeInt(s.length());
    666         for (char c : s.toCharArray())
    667         {
    668             out.writeChar(c);
    669         }
    670     }
    671 
    672     /**
    673      * 判断字符串是否为空(null和空格)
    674      *
    675      * @param cs
    676      * @return
    677      */
    678     public static boolean isBlank(CharSequence cs)
    679     {
    680         int strLen;
    681         if (cs == null || (strLen = cs.length()) == 0)
    682         {
    683             return true;
    684         }
    685         for (int i = 0; i < strLen; i++)
    686         {
    687             if (!Character.isWhitespace(cs.charAt(i)))
    688             {
    689                 return false;
    690             }
    691         }
    692         return true;
    693     }
    694 
    695     public static String join(String delimiter, Collection<String> stringCollection)
    696     {
    697         StringBuilder sb = new StringBuilder(stringCollection.size() * (16 + delimiter.length()));
    698         for (String str : stringCollection)
    699         {
    700             sb.append(str).append(delimiter);
    701         }
    702 
    703         return sb.toString();
    704     }
    705 
    706     public static String combine(String... termArray)
    707     {
    708         StringBuilder sbSentence = new StringBuilder();
    709         for (String word : termArray)
    710         {
    711             sbSentence.append(word);
    712         }
    713         return sbSentence.toString();
    714     }
    715 
    716     public static String join(Iterable<? extends CharSequence> s, String delimiter)
    717     {
    718         Iterator<? extends CharSequence> iter = s.iterator();
    719         if (!iter.hasNext()) return "";
    720         StringBuilder buffer = new StringBuilder(iter.next());
    721         while (iter.hasNext()) buffer.append(delimiter).append(iter.next());
    722         return buffer.toString();
    723     }
    724 
    725     public static String combine(Sentence sentence)
    726     {
    727         StringBuilder sb = new StringBuilder(sentence.wordList.size() * 3);
    728         for (IWord word : sentence.wordList)
    729         {
    730             sb.append(word.getValue());
    731         }
    732 
    733         return sb.toString();
    734     }
    735 
    736     public static String combine(List<Word> wordList)
    737     {
    738         StringBuilder sb = new StringBuilder(wordList.size() * 3);
    739         for (IWord word : wordList)
    740         {
    741             sb.append(word.getValue());
    742         }
    743 
    744         return sb.toString();
    745     }
    746 }

    来源:https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/utility/TextUtility.java

  • 相关阅读:
    POJ 3710 Christmas Game#经典图SG博弈
    POJ 2599 A funny game#树形SG(DFS实现)
    POJ 2425 A Chess Game#树形SG
    LeetCode Array Easy 122. Best Time to Buy and Sell Stock II
    LeetCode Array Easy121. Best Time to Buy and Sell Stock
    LeetCode Array Easy 119. Pascal's Triangle II
    LeetCode Array Easy 118. Pascal's Triangle
    LeetCode Array Easy 88. Merge Sorted Array
    ASP.NET MVC 学习笔记之 MVC + EF中的EO DTO ViewModel
    ASP.NET MVC 学习笔记之面向切面编程与过滤器
  • 原文地址:https://www.cnblogs.com/lxcy/p/9290475.html
Copyright © 2011-2022 走看看