zoukankan      html  css  js  c++  java
  • CHzSeg

     1 #ifndef _HZSEG_H_040415_
     2 #define _HZSEG_H_040415_
     3 
     4 #include <iostream>
     5 #include <string>
     6 #include <cstring>
     7 #include <cstdlib>
     8 #include <fstream>
     9 #include "Dict.h"
    10 
    11 using namespace std;
    12 
    13 class CHzSeg
    14 {
    15 public:
    16     CHzSeg();
    17     ~CHzSeg();
    18 
    19     string SegmentSentenceMM (CDict&, string) const;//只保留下中文
    20     string SegmentHzStrMM (CDict&, string) const;//切词
    21     string SegmentURL(CDict&, string) const;
    22 
    23     // process a sentence before segmentation
    24     void Translate(char* SourceStr) const;
    25 };
    26     
    27 #endif /* _HZSEG_H_040415_ */
      1 // HzSeg handling
      2 
      3 #include "HzSeg.h"
      4 #include "Dict.h"
      5 
      // Longest dictionary candidate, in bytes, that the maximum-matching
      // segmenter tries before it starts shortening the candidate.
      const unsigned int MAX_WORD_LENGTH = 8;

      // Marker written after every emitted token (a slash and two spaces).
      const std::string SEPARATOR = "/  ";
      8 
      9 CHzSeg::CHzSeg() {
     10 }
     11 
     12 CHzSeg::~CHzSeg() {
     13 }
     14 
     15 // Using Max Matching method to segment a character string.
     16 string CHzSeg::SegmentHzStrMM(CDict &dict, string s1) const {
     17     string s2 = ""; //保存句子s1的分词结果
     18     while (!s1.empty()) {
     19         unsigned int len = s1.size();
     20 
     21         if (len > MAX_WORD_LENGTH)
     22             len = MAX_WORD_LENGTH;
     23         //如果待切分的句子大于最大切分单元
     24         //len=最大切分单元,否则len=句子的长度
     25 
     26         string w = s1.substr(0, len); //取s1句子最左边长度len为的子句子
     27         bool isw = dict.IsWord(w); //判断刚刚取出来的子句子是不是一个词
     28 
     29         while (len > 2 && isw == false) { //当w中至少有2个中文字&&不能构成字的时候,减去最右边的一个中文字符
     30             len -= 2; // cut a word
     31             w = w.substr(0, len);
     32             isw = dict.IsWord(w);
     33         }
     34         s2 += w + SEPARATOR;
     35 
     36         s1 = s1.substr(w.size());
     37     }
     38 
     39     return s2;
     40 }
     41 
     42 // process a sentence before segmentation
     43 string CHzSeg::SegmentSentenceMM(CDict &dict, string s1) const {
     44     string s2 = "";
     45     unsigned int i, len;
     46     cout << endl << "I'm in SegmentSentenceMM" << endl;
     47     cout << s1 << endl;
     48     while (!s1.empty()) {
     49         unsigned char ch = (unsigned char) s1[0];
     50         if (ch < 128) { //吃掉一行中所有换行符以外的英文字符
     51             i = 1;
     52             len = s1.size();
     53             while (i < len && ((unsigned char) s1[i] < 128) && (s1[i] != 10)
     54                     && (s1[i] != 13)) { // LF, CR
     55                 i++; //不是回车换行
     56             }
     57 
     58             if ((ch != 32) && (ch != 10) && (ch != 13)) { // SP, LF, CR
     59                 s2 += s1.substr(0, i) + SEPARATOR;
     60             } else {
     61                 if (ch == 10 || ch == 13) {
     62                     s2 += s1.substr(0, i);
     63                     cout << "当前s2:" << s2 << endl;
     64                 }
     65             }
     66 
     67             if (i <= s1.size()) {
     68                 s1 = s1.substr(i); //获得删去部分英文字符后的数据
     69             } else
     70                 break; // 处理完英文字符
     71 
     72             continue;
     73 
     74         } else {
     75             if (ch < 176) { //中文标点等非汉字字符128<=ch<176
     76                 i = 0;
     77                 len = s1.length();
     78 
     79                 while (i < len && ((unsigned char) s1[i] < 176)
     80                         && ((unsigned char) s1[i] >= 161)
     81                         && (!((unsigned char) s1[i] == 161
     82                                 && ((unsigned char) s1[i + 1] >= 162
     83                                         && (unsigned char) s1[i + 1] <= 168)))
     84                         && (!((unsigned char) s1[i] == 161
     85                                 && ((unsigned char) s1[i + 1] >= 171
     86                                         && (unsigned char) s1[i + 1] <= 191)))
     87                         && (!((unsigned char) s1[i] == 163
     88                                 && ((unsigned char) s1[i + 1] == 172
     89                                         || (unsigned char) s1[i + 1] == 161)
     90                                 || (unsigned char) s1[i + 1] == 168
     91                                 || (unsigned char) s1[i + 1] == 169
     92                                 || (unsigned char) s1[i + 1] == 186
     93                                 || (unsigned char) s1[i + 1] == 187
     94                                 || (unsigned char) s1[i + 1] == 191))) {
     95                     i = i + 2; //假定没有半个汉字
     96                 }
     97 
     98                 if (i == 0)
     99                     i = i + 2;
    100 
    101                 if (!(ch == 161 && (unsigned char) s1[1] == 161)) { // 不处理中文空格
    102                     if (i <= s1.size()) // yhf
    103                         s2 += s1.substr(0, i) + SEPARATOR; // 其他的非汉字双字节字符可能连续输出
    104                     else
    105                         break; // yhf
    106                 }
    107 
    108                 if (i <= s1.size()) {
    109                     s1 = s1.substr(i); //取s1从下标i开始的子字符串
    110 
    111                 } else
    112                     break; //yhf
    113 
    114                 continue;
    115             }
    116         }
    117 
    118         i = 2;
    119         len = s1.length();
    120         while (i < len && (unsigned char) s1[i] >= 176)
    121 //    while(i<len && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161)
    122             i += 2;
    123 
    124         s2 += SegmentHzStrMM(dict, s1.substr(0, i));
    125 
    126         if (i <= len) // yhf
    127             s1 = s1.substr(i);
    128         else
    129             break; // yhf
    130     }
    131 
    132     return s2;
    133 }
    134 
    135 // translate the encoded URL(%xx) to actual chars
    136 void CHzSeg::Translate(char* SourceStr) const {
    137     int i = 0;
    138     int j = 0;
    139     char *tempstr, tempchar1, tempchar2;
    140 
    141     tempstr = (char*) malloc(strlen(SourceStr) + 1);
    142     if (tempstr == NULL) {
    143         return;
    144     }
    145 
    146     while (SourceStr[j]) {
    147         if ((tempstr[i] = SourceStr[j]) == '%') {
    148             if (SourceStr[j + 1] >= 'A')
    149                 tempchar1 = ((SourceStr[j + 1] & 0xdf) - 'A') + 10;
    150             else
    151                 tempchar1 = (SourceStr[j + 1] - '0');
    152             if (SourceStr[j + 2] >= 'A')
    153                 tempchar2 = ((SourceStr[j + 2] & 0xdf) - 'A') + 10;
    154             else
    155                 tempchar2 = (SourceStr[j + 2] - '0');
    156             tempstr[i] = tempchar1 * 16 + tempchar2;
    157             j = j + 2;
    158         }
    159         i++;
    160         j++;
    161     }
    162     tempstr[i] = '\0';
    163     strcpy(SourceStr, tempstr);
    164 
    165     if (tempstr)
    166         free(tempstr);
    167 }
    168 
    169 /*
    170  * segment the image URL by '/'
    171  * omit the domain name
    172  */
    173 string CHzSeg::SegmentURL(CDict &dict, string url) const {
    174     string::size_type idx, nidx;
    175     char *curl = (char *) url.c_str();
    176     this->Translate(curl);
    177     url = curl;
    178     if ((idx = url.find("http://", 0)) != string::npos) {
    179         if ((nidx = url.find("/", 7)) != string::npos) {
    180             url = url.substr(nidx + 1); // cut the part of sitename
    181         }
    182     }
    183     idx = 0;
    184     while ((idx = url.find("/", idx)) != string::npos) {
    185         url.replace(idx, 1, SEPARATOR); // replace "/" with SEPARATOR "/  "
    186         idx += 3;
    187     }
    188     if ((idx = url.rfind(".")) != string::npos) {
    189         url = url.erase(idx); // erase the file extension
    190     }
    191 
    192     url += "/  ";
    193 
    194     // segment the string whose length is greater than 8 (4 HZ_chars)
    195     idx = 0;
    196     nidx = 0;
    197     bool isover = false;
    198     string stmp;
    199     while (!isover) {
    200         if ((nidx = url.find(SEPARATOR, idx)) == string::npos)
    201             isover = true;
    202         if (nidx - idx > 0) {
    203             stmp = url.substr(idx, nidx - idx);
    204             stmp = SegmentSentenceMM(dict, stmp);
    205             if (stmp.size() >= 3)
    206                 stmp.erase(stmp.length() - 3); // erase the tail "/  "
    207             url = url.replace(idx, nidx - idx, stmp);
    208             idx += stmp.length() + 3;
    209         } else if (nidx == string::npos && idx < url.length()) {
    210             stmp = url.substr(idx);
    211             stmp = SegmentSentenceMM(dict, stmp);
    212             stmp.erase(stmp.length() - 3);
    213             url = url.substr(0, idx) + stmp;
    214         } else
    215             idx = nidx + 3;
    216     }
    217 
    218     return url;
    219 
    220 }
  • 相关阅读:
    常见的MYSQL高可用解决方案
    CDN——到底用还是不用?
    Maven学习总结
    Git – Fast Forward 和 no fast foward
    Spring boot 打成jar包问题总结
    Spring Data JPA进阶——Specifications和Querydsl
    Arp攻击实战
    crontab命令
    mtr命令
    如何使用qperf来衡量网络带宽和延迟性能?
  • 原文地址:https://www.cnblogs.com/kakamilan/p/2591824.html
Copyright © 2011-2022 走看看