zoukankan      html  css  js  c++  java
  • C#将汉字转换为拼音首字母

    关于这个话题以前曾经使用过一个简便的算法很长时间, 代码如下:
           /// <summary>
           /// Maps a single character to a lowercase pinyin initial by comparing the input,
           /// via <c>string.CompareTo</c>, against an ascending sequence of boundary strings
           /// and returning the letter of the first boundary the input sorts before.
           /// </summary>
           /// <param name="str">The text to classify; the surrounding article says it is meant to be one Chinese character.</param>
           /// <returns>
           /// One of the lowercase letters listed below (there is no branch for "i", "u" or "v"),
           /// or <paramref name="str"/> itself when no branch matches.
           /// </returns>
           /// <remarks>
           /// NOTE(review): every boundary literal below is the empty string. Presumably the original
           /// boundary characters were lost when this listing was extracted. As written, none of the
           /// <c>CompareTo("") &lt; 0</c> tests is expected to succeed, so the method falls through to
           /// <c>return str</c>. Restore the boundary characters from the original article before use.
           /// The surrounding article also reports that results changed after moving from .NET 3.5 to
           /// .NET 4.0 (for example "梅" returning "L"), so this approach depends on the runtime's
           /// string comparison behaviour.
           /// </remarks>
           private string ToPinyinSingle(string str)
            {
                // Inputs sorting before the first boundary are returned unchanged.
                if (str.CompareTo("") < 0)
                    return str;
                // Each following test is the upper boundary of one initial letter's range.
                if (str.CompareTo("") < 0)
                    return "a";
                if (str.CompareTo("") < 0)
                    return "b";
                if (str.CompareTo("") < 0)
                    return "c";
                if (str.CompareTo("") < 0)
                    return "d";
                if (str.CompareTo("") < 0)
                    return "e";
                if (str.CompareTo("") < 0)
                    return "f";
                if (str.CompareTo("") < 0)
                    return "g";
                if (str.CompareTo("") < 0)
                    return "h";
                if (str.CompareTo("") < 0)
                    return "j";
                if (str.CompareTo("") < 0)
                    return "k";
                if (str.CompareTo("") < 0)
                    return "l";
                if (str.CompareTo("") < 0)
                    return "m";
                if (str.CompareTo("") < 0)
                    return "n";
                if (str.CompareTo("") < 0)
                    return "o";
                if (str.CompareTo("") < 0)
                    return "p";
                if (str.CompareTo("") < 0)
                    return "q";
                if (str.CompareTo("") < 0)
                    return "r";
                if (str.CompareTo("") < 0)
                    return "s";
                if (str.CompareTo("") < 0)
                    return "t";
                if (str.CompareTo("") < 0)
                    return "w";
                if (str.CompareTo("") < 0)
                    return "x";
                if (str.CompareTo("") < 0)
                    return "y";
                if (str.CompareTo("") < 0)
                    return "z";
                // Nothing matched: hand the input back unchanged.
                return str;

            } 

    这个函数只处理单个汉字, 简单地加个循环就可以让它处理文字串了.

    在.net 3.5下, 它一直工作得很好, 虽然偶尔也有出错的时候, 但是概率极低, 基本上可以忽略不计.

    然而后来我把项目升级到.net 4.0以后, 发现出错的几率直线上升, 已经高到了无法容忍的程度(例如, "梅" 会返回"L"), 简单查了一下, 没找到微软关于String.CompareTo函数有什么变化的说明, 束手无策, 于是换用另一个也很简单的算法(http://topic.csdn.net/u/20090219/12/61745e3a-a39e-4f4d-8985-67d124236694.html):

    /// <summary>
    /// Returns the uppercase pinyin initial of a GB2312 level-1 Chinese character.
    /// </summary>
    /// <param name="cn">
    /// The text to classify. Only the first two GB2312 bytes of the encoded text are examined,
    /// so this is intended for a single character.
    /// </param>
    /// <returns>
    /// "A".."Z" when the two-byte code falls inside the level-1 table below;
    /// "?" when the text encodes to two or more bytes but the code is outside that table
    /// (for example level-2 characters such as "闫");
    /// <paramref name="cn"/> itself when it encodes to fewer than two bytes (ASCII or empty).
    /// </returns>
    /// <remarks>
    /// The text is encoded explicitly as GB2312 instead of <c>Encoding.Default</c>: the table
    /// holds GB2312 code values, while the default encoding depends on the machine and runtime
    /// and would yield unrelated byte values. On .NET Core / .NET 5+ the code-pages encoding
    /// provider has to be registered before "GB2312" can be resolved.
    /// </remarks>
    public static string getSpell(string cn)
    {
        byte[] gbBytes = System.Text.Encoding.GetEncoding("GB2312").GetBytes(cn);
        if (gbBytes.Length <= 1)
        {
            return cn;
        }

        // Combine lead and trail byte into one 16-bit GB2312 code.
        int code = (gbBytes[0] << 8) + gbBytes[1];

        // First code of each initial A..Z. Letters with no characters (I, U, V) repeat
        // the next letter's start so that their range is empty.
        int[] areacode = {45217,45253,45761,46318,46826,47010,47297,47614,48119,48119,49062,49324,49896,50371,50614,50622,50906,51387,51446,52218,52698,52698,52698,52980,53689,54481};

        // Exclusive upper bound of the last range (end of the level-1 characters).
        const int levelOneEnd = 55290;

        for (int i = 0; i < areacode.Length; i++)
        {
            int max = (i + 1 < areacode.Length) ? areacode[i + 1] : levelOneEnd;
            if (areacode[i] <= code && code < max)
            {
                return ((char)('A' + i)).ToString();
            }
        }

        return "?";
    }

    但是这个函数出错的概率也很高, 例如"闫""窦""圳" 等都无法识别, 追查了一下原因, 发现原来对GB2312编码来说, 存放规定是这样的:

    01-09区为特殊符号。 

    16-55区为一级汉字,按拼音排序。 
    56-87区为二级汉字,按部首/笔画排序。
    每个汉字及符号以两个字节来表示。第一个字节称为“高位字节”,第二个字节称为“低位字节”。
    “高位字节”使用了0xA1-0xF7(把01-87区的区号加上0xA0),“低位字节”使用了0xA1-0xFE(把01-94加上0xA0)。
    例如“啊”字在大多数程序中,会以0xB0A1储存。(与区位码对比:0xB0=0xA0+16,0xA1=0xA0+1)


    上述几个字位置码都大于55290, 显然是二级汉字, 这个算法就处理不了了, 换言之, 这种写法只能用于处理一级汉字. 这当然是不可接受的. 

    后来翻查良久, 终于找到一个用C++写的算法, 可以同时处理一级汉字和二级汉字(http://download.csdn.net/detail/ronjay/1955072), 我把它改写成了C#, 代码如下: 

            public class ChineseToPinYin
            {
                #region " 全局变量 "

                private static string[] _regionChar = new string[32]
                {
                    "CJWGNSPGCGNESYPBTYYZDXYKYGTDJNNJQMBSGZSCYJSYYQPGKBZGYCYWJKGKLJSWKPJQHYTWDDZLSGMRYPYWWCCKZNKYDG",
                    "TTNGJEYKKZYTCJNMCYLQLYPYQFQRPZSLWBTGKJFYXJWZLTBNCXJJJJZXDTTSQZYCDXXHGCKBPHFFSSWYBGMXLPBYLLLHLX",
                    "SPZMYJHSOJNGHDZQYKLGJHSGQZHXQGKEZZWYSCSCJXYEYXADZPMDSSMZJZQJYZCDJZWQJBDZBXGZNZCPWHKXHQKMWFBPBY",
                    "DTJZZKQHYLYGXFPTYJYYZPSZLFCHMQSHGMXXSXJJSDCSBBQBEFSJYHWWGZKPYLQBGLDLCCTNMAYDDKSSNGYCSGXLYZAYBN",
                    "PTSDKDYLHGYMYLCXPYCJNDQJWXQXFYYFJLEJBZRXCCQWQQSBNKYMGPLBMJRQCFLNYMYQMSQTRBCJTHZTQFRXQHXMJJCJLX",
                    "QGJMSHZKBSWYEMYLTXFSYDSGLYCJQXSJNQBSCTYHBFTDCYZDJWYGHQFRXWCKQKXEBPTLPXJZSRMEBWHJLBJSLYYSMDXLCL",
                    "QKXLHXJRZJMFQHXHWYWSBHTRXXGLHQHFNMNYKLDYXZPWLGGTMTCFPAJJZYLJTYANJGBJPLQGDZYQYAXBKYSECJSZNSLYZH",
                    "ZXLZCGHPXZHZNYTDSBCJKDLZAYFMYDLEBBGQYZKXGLDNDNYSKJSHDLYXBCGHXYPKDJMMZNGMMCLGWZSZXZJFZNMLZZTHCS",
                    "YDBDLLSCDDNLKJYKJSYCJLKOHQASDKNHCSGANHDAASHTCPLCPQYBSDMPJLPCJOQLCDHJJYSPRCHNWJNLHLYYQYYWZPTCZG",
                    "WWMZFFJQQQQYXACLBHKDJXDGMMYDJXZLLSYGXGKJRYWZWYCLZMSSJZLDBYDCFCXYHLXCHYZJQSFQAGMNYXPFRKSSBJLYXY",
                    "SYGLNSCMHCWWMNZJJLXXHCHSYDSTTXRYCYXBYHCSMXJSZNPWGPXXTAYBGAJCXLYSDCCWZOCWKCCSBNHCPDYZNFCYYTYCKX",
                    "KYBSQKKYTQQXFCWCHCYKELZQBSQYJQCCLMTHSYWHMKTLKJLYCXWHEQQHTQHZPQSQSCFYMMDMGBWHWLGSSLYSDLMLXPTHMJ",
                    "HWLJZYHZJXHTXJLHXRSWLWZJCBXMHZQXSDZPMGFCSGLSXYMJSHXPJXWMYQKSMYPLRTHBXFTPMHYXLCHLHLZYLXGSSSSTCL",
                    "SLDCLRPBHZHXYYFHBBGDMYCNQQWLQHJJZYWJZYEJJDHPBLQXTQKWHLCHQXAGTLXLJXMSLXHTZKZJECXJCJNMFBYCSFYWYB",
                    "JZGNYSDZSQYRSLJPCLPWXSDWEJBJCBCNAYTWGMPAPCLYQPCLZXSBNMSGGFNZJJBZSFZYNDXHPLQKZCZWALSBCCJXJYZGWK",
                    "YPSGXFZFCDKHJGXDLQFSGDSLQWZKXTMHSBGZMJZRGLYJBPMLMSXLZJQQHZYJCZYDJWBMJKLDDPMJEGXYHYLXHLQYQHKYCW",
                    "CJMYYXNATJHYCCXZPCQLBZWWYTWBQCMLPMYRJCCCXFPZNZZLJPLXXYZTZLGDLDCKLYRZZGQTGJHHHJLJAXFGFJZSLCFDQZ",
                    "LCLGJDJCSNCLLJPJQDCCLCJXMYZFTSXGCGSBRZXJQQCTZHGYQTJQQLZXJYLYLBCYAMCSTYLPDJBYREGKLZYZHLYSZQLZNW",
                    "CZCLLWJQJJJKDGJZOLBBZPPGLGHTGZXYGHZMYCNQSYCYHBHGXKAMTXYXNBSKYZZGJZLQJDFCJXDYGJQJJPMGWGJJJPKQSB",
                    "GBMMCJSSCLPQPDXCDYYKYFCJDDYYGYWRHJRTGZNYQLDKLJSZZGZQZJGDYKSHPZMTLCPWNJAFYZDJCNMWESCYGLBTZCGMSS",
                    "LLYXQSXSBSJSBBSGGHFJLWPMZJNLYYWDQSHZXTYYWHMCYHYWDBXBTLMSYYYFSXJCSDXXLHJHFSSXZQHFZMZCZTQCXZXRTT",
                    "DJHNNYZQQMNQDMMGYYDXMJGDHCDYZBFFALLZTDLTFXMXQZDNGWQDBDCZJDXBZGSQQDDJCMBKZFFXMKDMDSYYSZCMLJDSYN",
                    "SPRSKMKMPCKLGDBQTFZSWTFGGLYPLLJZHGJJGYPZLTCSMCNBTJBQFKTHBYZGKPBBYMTTSSXTBNPDKLEYCJNYCDYKZDDHQH",
                    "SDZSCTARLLTKZLGECLLKJLQJAQNBDKKGHPJTZQKSECSHALQFMMGJNLYJBBTMLYZXDCJPLDLPCQDHZYCBZSCZBZMSLJFLKR",
                    "ZJSNFRGJHXPDHYJYBZGDLQCSEZGXLBLGYXTWMABCHECMWYJYZLLJJYHLGBDJLSLYGKDZPZXJYYZLWCXSZFGWYYDLYHCLJS",
                    "CMBJHBLYZLYCBLYDPDQYSXQZBYTDKYXJYYCNRJMPDJGKLCLJBCTBJDDBBLBLCZQRPPXJCGLZCSHLTOLJNMDDDLNGKAQHQH",
                    "JGYKHEZNMSHRPHQQJCHGMFPRXHJGDYCHGHLYRZQLCYQJNZSQTKQJYMSZSWLCFQQQXYFGGYPTQWLMCRNFKKFSYYLQBMQAMM",
                    "MYXCTPSHCPTXXZZSMPHPSHMCLMLDQFYQXSZYJDJJZZHQPDSZGLSTJBCKBXYQZJSGPSXQZQZRQTBDKYXZKHHGFLBCSMDLDG",
                    "DZDBLZYYCXNNCSYBZBFGLZZXSWMSCCMQNJQSBDQSJTXXMBLTXZCLZSHZCXRQJGJYLXZFJPHYMZQQYDFQJJLZZNZJCDGZYG",
                    "CTXMZYSCTLKPHTXHTLBJXJLXSCDQXCBBTJFQZFSLTJBTKQBXXJJLJCHCZDBZJDCZJDCPRNPQCJPFCZLCLZXZDMXMPHJSGZ",
                    "GSZZQJYLWTJPFSYASMCJBTZKYCWMYTCSJJLJCQLWZMALBXYFBPNLSFHTGJWEJJXXGLLJSTGSHJQLZFKCGNNDSZFDEQFHBS",
                    "AQTGLLBXMMYGSZLDYDQMJJRGBJTKGDHGKBLQKBDMBYLXWCXYTTYBKMRTJZXQJBHLMHMJJZMQASLDCYXYQDLQCAFYWYXQHZ"
                };
                private static System.Text.Encoding _encoding = System.Text.Encoding.GetEncoding("GB2312");

                #endregion

                private static bool In(int lp, int hp, int value)
                {
                    return ((value <= hp) && (value >= lp));
                }
                public static char GetFirstChar(string chineseChar)
                {
                    var bytes = _encoding.GetBytes(chineseChar);
                    if (bytes.Length != 2)
                        return chineseChar[0];
                    return GetChar(bytes[0], bytes[1], chineseChar);
                }
                private static char GetChar(byte c1, byte c2, string originChar)
                {
                    var Hi = c1 << 8;
                    var Lo = c2;
                    int n = Hi + Lo;
                    if (n <= 0xD7F9)
                    {
                        if (In(0xB0A10xB0C4, n)) return 'A';
                        if (In(0XB0C50XB2C0, n)) return 'B';
                        if (In(0xB2C10xB4ED, n)) return 'C';
                        if (In(0xB4EE0xB6E9, n)) return 'D';
                        if (In(0xB6EA0xB7A1, n)) return 'E';
                        if (In(0xB7A20xB8C0, n)) return 'F';
                        if (In(0xB8C10xB9FD, n)) return 'G';
                        if (In(0xB9FE0xBBF6, n)) return 'H';
                        if (In(0xBBF70xBFA5, n)) return 'J';
                        if (In(0xBFA60xC0AB, n)) return 'K';
                        if (In(0xC0AC0xC2E7, n)) return 'L';
                        if (In(0xC2E80xC4C2, n)) return 'M';
                        if (In(0xC4C30xC5B5, n)) return 'N';
                        if (In(0xC5B60xC5BD, n)) return 'O';
                        if (In(0xC5BE0xC6D9, n)) return 'P';
                        if (In(0xC6D10xC8BA, n)) return 'Q';
                        if (In(0xC8BB0xC8F5, n)) return 'R';
                        if (In(0xC8F60xCBF9, n)) return 'S';
                        if (In(0xCBFA0xCDD9, n)) return 'T';
                        if (In(0xCDDA0xCEF3, n)) return 'W';
                        if (In(0xCEF40xD1B8, n)) return 'X';
                        if (In(0xD1B90xD4D0, n)) return 'Y';
                        if (In(0xD4D10xD7F9, n)) return 'Z';
                        return originChar[0];
                    }
                    else
                    {
                        var b1 = (c1 & 0x7F) - 0x20 - 56;
                        var b2 = (c2 & 0x7F) - 0x20 - 1;
                        if (b1 >= 0 && b1 <= 31 && b2 >= 0 && b2 <= 93)
                        {
                            return _regionChar[b1][b2];
                        }
                        return originChar[0];
                    }
                }

            } 

     这个算法目前还没有发现哪个汉字会出错. 

    ---------------------------------------------

    作者:夏狼哉
    博客:http://www.cnblogs.com/Moosdau

    如需引用,敬请保留作者信息,谢谢

  • 相关阅读:
    HDU4628+状态压缩DP
    Javascript 去掉字符串前后空格的五种方法
    Javascript 数组之判断取值和数组取值
    ASP.NET MVC 出现错误 “The view 'XXX' or its master was not found or no view engine support”
    ASP.NET MVC 页面调整并传递参数
    ASP.NET MV3 部署网站 报"Could not load file or assembly ' System.Web.Helpers “ 错的解决方法
    ASP.NET MVC 控制器向View传值的三种方法
    CSharp 如何通过拼接XML调用存储过程来查询数据
    SQLServer : EXEC和sp_executesql的区别
    关于SQLServer2005的学习笔记—异常捕获及处理
  • 原文地址:https://www.cnblogs.com/Moosdau/p/2277727.html
Copyright © 2011-2022 走看看