zoukankan      html  css  js  c++  java
  • C++ 简单中文敏感词检测工具类

    具体思路:

    1->敏感词库,可从数据库读取,也可以从文件加载.

    2->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节、汉字两个字节的格式编码,便于切分文字.

    3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.

    4->检测一段文字内容时,也先转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.

    代码.h

     1 #ifndef SENSITIVE_WORDS_CHECKER_
     2 #define SENSITIVE_WORDS_CHECKER_
     3 #include <stdint.h>
     4 #include <stdio.h>
     5 #include <memory.h>
     6 #include <map>
     7 #include <vector>
     8 
     9 enum {
    10     enmMaxWordLength = 32,    //每个敏感词最大长度
    11     enmMaxWordsFileLength = 1024 * 128,    //敏感词文件最大长度128k
    12     enmMaxContentLength = 1024,    // 单次检测内容测最大长度
    13 };
    14 
    15 struct SensitiveWord
    16 {
    17     char szWord[enmMaxWordLength];
    18     SensitiveWord()
    19     {
    20         memset(szWord, 0, enmMaxWordLength);
    21     }
    22 };
    23 
    24 typedef std::vector<SensitiveWord*> WordList;
    25 typedef std::map<uint32_t, WordList*> WordMap;
    26 
    27 class SensitiveWordsChecker
    28 {
    29 public:
    30     SensitiveWordsChecker() :arrSensitiveWord(NULL), nSensitiveWordCnt(0){}
    31     ~SensitiveWordsChecker(){ delete[] arrSensitiveWord; }
    32 public:
    33     void LoadWordsFromUTF8File(const char *file_name);
    34     void LoadWordsFromGBKFile(const char *file_name);
    35 protected:
    36     int32_t WriteToFile(const char buf[], const int32_t buf_size, const char *file_name);
    37     void DumpWordMap();
    38     void GenTestData();
    39     void Test();
    40     void StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...);
    41 private:
    42     int32_t LoadFile(char buf[], const uint32_t buf_size, const char *file_name);
    43     int32_t CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    44     int32_t UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    45     int32_t GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    46     uint32_t GetWordsCount(char buf[],const uint32_t buf_size,char separator);
    47     char *StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list);
    48     int32_t GetWords(char gbk_buf[], const uint32_t buf_size, char separator);
    49     void BuildWordMap();
    50     uint32_t GetFirstCharFromGBK(char gbk_buf[]);
    51     uint32_t GetFirstCharFromTUF8(char utf8_buf[]);
    52     uint32_t GetFirstChar(char buf[]);
    53     // 返回 0 表示in_utf8_buf里面没有敏感词
    54     // 返回 1 表示in_utf8_buf里面含有关键词,并将关键词替换为*输出到out_utf8_buf
    55     int32_t CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]);
    56     const SensitiveWord* FindSensitiveWord(uint32_t code,const char *pos);
    57 private:
    58     SensitiveWord *arrSensitiveWord;
    59     uint32_t nSensitiveWordCnt;
    60     WordMap mapWords;
    61 };
    62 
    63 #endif
    View Code

    .cpp

    #include "SenditiveWordsChecker.h"
    #include "stdio.h"
    #include "string.h"
    #include "iconv.h"
    #include <stdarg.h>
    #include <stdlib.h>
    #include <new>
    #include <vector>
      7 
      8 void SensitiveWordsChecker::LoadWordsFromUTF8File(const char *file_name)
      9 {
     10     char utf8_buf[enmMaxWordsFileLength] , gbk_buf[enmMaxWordsFileLength];
     11     LoadFile(utf8_buf, enmMaxWordsFileLength, file_name);
     12     UTF8_To_GBK(utf8_buf, strlen(utf8_buf), gbk_buf, enmMaxWordsFileLength);
     13     GetWords(gbk_buf, enmMaxWordsFileLength, ',');
     14 }
     15 
     16 void SensitiveWordsChecker::LoadWordsFromGBKFile(const char *file_name)
     17 {
     18     char gbk_buf[enmMaxWordsFileLength];
     19     LoadFile(gbk_buf, enmMaxWordsFileLength, file_name);
     20     GetWords(gbk_buf, enmMaxWordsFileLength,',');
     21 }
     22 
     23 int32_t SensitiveWordsChecker::LoadFile(char buf[], const uint32_t buf_size, const char *file_name)
     24 {
     25     FILE * pFile;
     26     size_t lSize = 0, result = 0;
     27     fopen_s(&pFile, file_name, "rb");
     28     if (pFile == NULL) { fputs("File error
    ", stderr); return -1; }
     29     // obtain file size:
     30     fseek(pFile, 0, SEEK_END);
     31     lSize = ftell(pFile);
     32     rewind(pFile);
     33     if (lSize >= buf_size){ fputs("file too large
    ", stderr); return -1; }
     34     result = fread(buf, 1, lSize, pFile);
     35     if (result != lSize) { fputs("Reading error
    ", stderr); return -1; }
     36     buf[lSize] = '';
     37     return fclose(pFile);
     38 }
     39 
     40 int32_t SensitiveWordsChecker::CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen)
     41 {
     42     iconv_t cd;
     43     char **pin = &inbuf;
     44     char **pout = &outbuf;
     45 
     46     cd = iconv_open(to_charset, from_charset);
     47     if (cd == 0)
     48         return -1;
     49     memset(outbuf, 0, outlen);
     50     if (iconv(cd, pin, &inlen, pout, &outlen) == -1)
     51         return -1;
     52     iconv_close(cd);
     53     *pout = '';
     54     return 0;
     55 }
     56 
     57 int32_t SensitiveWordsChecker::UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
     58 {
     59     return CodeConvert("utf-8", "gbk", inbuf, inlen, outbuf, outlen);
     60 }
     61 
     62 int32_t SensitiveWordsChecker::GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
     63 {
     64     return CodeConvert("gbk", "utf-8", inbuf, inlen, outbuf, outlen);
     65 }
     66 
     67 uint32_t SensitiveWordsChecker::GetWordsCount(char buf[], const uint32_t buf_size, char separator)
     68 {
     69     const char *p = buf - 1;
     70     uint32_t i = 0;
     71     while ((p = strchr(p + 1, separator)) != NULL)
     72     {
     73         ++i;
     74     }
     75     return i;
     76 }
     77 
     78 int32_t SensitiveWordsChecker::WriteToFile(const char buf[], const int32_t buf_size, const char *file_name)
     79 {
     80     FILE * pFile;
     81     size_t result;
     82     fopen_s(&pFile, file_name, "wb");
     83     if (pFile == NULL) { fputs("File error
    ", stderr); return -1; }
     84     result = fwrite(buf, 1, buf_size, pFile);
     85     if (result != buf_size) { fputs("Writing error
    ", stderr); return -1; }
     86     return fclose(pFile);
     87 }
     88 
     89 int32_t SensitiveWordsChecker::GetWords(char gbk_buf[], const uint32_t buf_size, char separator)
     90 {
     91     char buf[enmMaxWordsFileLength];
     92     StrcpyExcludeChar(buf, enmMaxWordsFileLength, gbk_buf, "
    ");    //排除换行符
     93     uint32_t nWordsCount = GetWordsCount(buf, buf_size,',');
     94     printf("words_count=%d
    ", nWordsCount);
     95     arrSensitiveWord = new SensitiveWord[nWordsCount];
     96     if (arrSensitiveWord == NULL){return -1;}
     97     nSensitiveWordCnt = 0;
     98     const char *p = NULL,*q = buf;
     99     while ((p = strchr(q, separator)) != NULL)
    100     {
    101         memcpy(arrSensitiveWord[nSensitiveWordCnt].szWord, q, p - q);
    102         //printf("%s
    ", arrSensitiveWord[nSensitiveWordCnt].szWord);
    103         q = p + 1;
    104         ++nSensitiveWordCnt;
    105     }
    106     BuildWordMap();
    107     return 0;
    108 }
    109 
    110 char * SensitiveWordsChecker::StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list)
    111 {
    112     uint32_t i = 0, j = 0, flag = 0;
    113     const char *p = NULL;
    114     if (dst == NULL && src == NULL)return NULL;
    115     if (dst == src)return dst;
    116     for (; j < dst_len && src[i] != ''; ++i)
    117     {
    118         flag = 0;
    119         p = exclude_list;
    120         while (p && *p != '')
    121         {
    122             if (*p == src[i]){ flag = 1; break; }
    123             p++;
    124         }
    125         if (flag == 0)dst[j++] = src[i];
    126     }
    127     dst[j] = '';
    128     return dst;
    129 }
    130 
    131 uint32_t SensitiveWordsChecker::GetFirstCharFromGBK(char gbk_buf[])
    132 {
    133     int32_t code = 0;
    134     int32_t len = strlen(gbk_buf);
    135     if (len == 0)return 0;
    136     if (gbk_buf[0] >= 0 || len == 1)
    137     {
    138         //printf("%c
    ", gbk_buf[0]);
    139         return uint32_t(gbk_buf[0]);    //ASCII 字符
    140     }
    141     else
    142     {
    143         short high = (short)gbk_buf[0] + 256;
    144         short low = (short)gbk_buf[1] + 256;
    145         code = high * 256 + low;
    146         char cstr[3];
    147         cstr[0] = gbk_buf[0];    // GBK严格按照两个字节表示一个中文字符
    148         cstr[1] = gbk_buf[1];
    149         cstr[2] = 0;
    150         //printf("%s %x
    ", cstr, code);
    151         return code;
    152     }
    153 }
    154 
    155 uint32_t SensitiveWordsChecker::GetFirstCharFromTUF8(char utf8_buf[])
    156 {
    157     uint32_t code = 0;
    158     int32_t len = strlen(utf8_buf);
    159     if (len == 0)return 0;
    160     if (utf8_buf[0] >= 0 || len == 1)
    161     {
    162         printf("%c
    ", utf8_buf[0]);
    163         return int32_t(utf8_buf[0]);    //ASCII 字符
    164     }
    165     else
    166     {
    167         short high = (short)utf8_buf[0];
    168         short mid = (short)utf8_buf[1];
    169         short low = (short)utf8_buf[2];
    170         code = high * 256 * 256 + mid * 256 + low;
    171         char cstr[4];
    172         cstr[0] = utf8_buf[0];    // UTF8大多数情况下三个字节表示一个中文字符
    173         cstr[1] = utf8_buf[1];
    174         cstr[2] = utf8_buf[2];
    175         cstr[3] = 0;
    176         printf("%s
    ", cstr);
    177         return code;
    178     }
    179 }
    180 
    181 uint32_t SensitiveWordsChecker::GetFirstChar(char buf[])
    182 {
    183     uint32_t code = 0;
    184     int32_t len = strlen(buf);
    185     if (len == 0)return 0;
    186     return (uint32_t)buf[0];
    187 }
    188 
    189 void SensitiveWordsChecker::BuildWordMap()
    190 {
    191     WordList *wordList = NULL;
    192     for (uint32_t i = 0; i < nSensitiveWordCnt; ++i)
    193     {
    194         uint32_t code = GetFirstCharFromGBK(arrSensitiveWord[i].szWord);
    195         WordMap::iterator it = mapWords.find(code);
    196         if (it == mapWords.end())
    197         {
    198             wordList = new WordList();
    199             mapWords[code] = wordList;
    200         }
    201         else
    202         {
    203             wordList = it->second;
    204         }
    205         wordList->push_back(&arrSensitiveWord[i]);
    206     }
    207     DumpWordMap();
    208     GenTestData();
    209     Test();
    210 }
    211 
    212 void SensitiveWordsChecker::DumpWordMap()
    213 {
    214     uint32_t word_cnt = 0,i = 0;
    215     WordMap::const_iterator it = mapWords.begin();
    216     for (; it != mapWords.end(); ++it)
    217     {
    218         //printf("%u : %u
    ", i++, it->second->size());
    219         word_cnt += it->second->size();
    220     }
    221     printf("word_cnt = %u
    ", word_cnt);
    222 }
    223 
    224 int32_t SensitiveWordsChecker::CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[])
    225 {
    226     // 先把被检测字符串转换为GBK编码
    227     char gbk_buf[enmMaxContentLength],out_gbk_buf[enmMaxContentLength];
    228     UTF8_To_GBK(in_utf8_buf, strlen(in_utf8_buf), gbk_buf, enmMaxContentLength);
    229     // 提取GBK字串里面的每一个字符,去map里面查找以该字符为首的关键词列表
    230     int32_t gbk_buf_len = strlen(gbk_buf);
    231     uint32_t code = 0, flag = 0, out_gbk_buf_len = 0;
    232     char c = 0, cstr[3] = { 0 };
    233     for (int32_t i = 0; i < gbk_buf_len;)
    234     {
    235         flag = 0;
    236         if (gbk_buf[i] >= 0 || i == gbk_buf_len - 1)
    237         {
    238             c = gbk_buf[i];
    239             //printf("%c
    ", c);   //ASCII字符
    240             code = (uint32_t)c;
    241             flag = 1;
    242             out_gbk_buf[out_gbk_buf_len] = c;
    243         }
    244         else
    245         {
    246             flag = 2;
    247             short high = (short)gbk_buf[i] + 256;
    248             short low = (short)gbk_buf[i + 1] + 256;
    249             code = high * 256 + low;
    250 
    251             cstr[0] = gbk_buf[i];
    252             cstr[1] = gbk_buf[i + 1];
    253             cstr[2] = 0;
    254 
    255             out_gbk_buf[out_gbk_buf_len] = cstr[0];
    256             out_gbk_buf[out_gbk_buf_len + 1] = cstr[1];
    257             //printf("%s
    ", cstr);
    258         }
    259         // 检查敏感词
    260         const SensitiveWord *sensitiveWord = FindSensitiveWord(code, &gbk_buf[i]);
    261         int32_t word_len = 0;
    262         if (NULL != sensitiveWord)
    263         {
    264             flag = 0;
    265             //printf("%s
    ", sensitiveWord->szWord);
    266             word_len = strlen(sensitiveWord->szWord);
    267             memset(&out_gbk_buf[out_gbk_buf_len],'*', word_len);
    268         }
    269         int32_t step = word_len + flag;
    270         i += step;
    271         out_gbk_buf_len += step;
    272     }
    273     out_gbk_buf[out_gbk_buf_len] = '';
    274     //printf("out_gbk_buf = %s
    ", out_gbk_buf);
    275     GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxContentLength);
    276     return 0;
    277 }
    278 
    279 const SensitiveWord* SensitiveWordsChecker::FindSensitiveWord(uint32_t code, const char *pos)
    280 {
    281     int32_t word_len = 0;
    282     WordMap::const_iterator it = mapWords.find(code);
    283     if (it == mapWords.end()){ return NULL; }
    284     WordList *wordList = it->second;
    285     for (uint32_t i = 0; i < wordList->size(); i++)
    286     {
    287         const SensitiveWord *sensitiveWord = (*wordList)[i];
    288         word_len = strlen(sensitiveWord->szWord);
    289         // 如果内容一样,就说明是敏感词
    290         if (memcmp(sensitiveWord->szWord, pos, word_len) == 0)
    291         {
    292             return sensitiveWord;
    293         }
    294     }
    295     return NULL;
    296 }
    297 
    298 void SensitiveWordsChecker::GenTestData()
    299 {
    300     char in_gbk_buf[enmMaxWordsFileLength], out_gbk_buf[enmMaxWordsFileLength];
    301     LoadFile(in_gbk_buf, enmMaxWordsFileLength, "poem.txt");
    302     int32_t len = strlen(in_gbk_buf);
    303     uint32_t n = 0;
    304     for (int32_t i = 0; i < len && n < enmMaxWordsFileLength;++i)
    305     {
    306         if (i % 4 == 0 && short(in_gbk_buf[i]) > 0)
    307         {
    308             int32_t nRandIndex = rand() % nSensitiveWordCnt;
    309             SensitiveWord sensitiveWord = arrSensitiveWord[nRandIndex];
    310             int32_t word_len = strlen(sensitiveWord.szWord);
    311             for (int32_t j = 0; j < word_len && n < enmMaxWordsFileLength; ++j)
    312             {
    313                 out_gbk_buf[n++] = sensitiveWord.szWord[j];
    314             }
    315         }
    316         out_gbk_buf[n++] = in_gbk_buf[i];
    317     }
    318     out_gbk_buf[n] = '';
    319     char out_utf8_buf[enmMaxWordsFileLength];
    320     GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxWordsFileLength);
    321     WriteToFile(out_utf8_buf, strlen(out_utf8_buf), "test_data.txt");
    322 }
    323 
    324 void SensitiveWordsChecker::Test()
    325 {
    326     const int32_t max_line_len = 1024;
    327     char utf8_buf[enmMaxWordsFileLength];
    328     char out_utf8_buf[enmMaxWordsFileLength];
    329     LoadFile(utf8_buf, enmMaxWordsFileLength, "test_data.txt");
    330     const char *p = NULL, *q = utf8_buf;
    331     uint32_t offset = 0;
    332     while ((p = strchr(q, '
    ')) != NULL)
    333     {
    334         char in_uft8_line[max_line_len] = { 0 };
    335         char out_uft8_line[max_line_len] = { 0 };
    336         char out_gbk_line[max_line_len] = { 0 };
    337         memcpy(in_uft8_line, q, p - q);
    338         UTF8_To_GBK(in_uft8_line, strlen(in_uft8_line), out_gbk_line, max_line_len);
    339         printf("%s
    ", out_gbk_line);
    340         CheckSensitiveWord(out_uft8_line, in_uft8_line);
    341         q = p + 1;
    342         char gbk[enmMaxContentLength];
    343         UTF8_To_GBK(out_uft8_line, strlen(out_uft8_line), gbk, enmMaxContentLength);
    344         printf("%s
    ", gbk);
    345         StrAppend(out_utf8_buf, enmMaxWordsFileLength, offset, "%s", out_uft8_line);
    346     }
    347     WriteToFile(out_utf8_buf, offset, "test_data_ret.txt");
    348 }
    349 
    350 void SensitiveWordsChecker::StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...)
    351 {
    352     va_list argptr;
    353     va_start(argptr, fmt);
    354     if (offset < bufLen)
    355     {
    356         offset += vsprintf_s(buf + offset, bufLen - offset, fmt, argptr);
    357     }
    358     va_end(argptr);
    359 }
    View Code

    测试效果:

    完整VS2013工程:http://download.csdn.net/detail/tangxin19930330/9558997

  • 相关阅读:
    fork-vfork -exit&_exit
    drop_cache-sar
    性能问题eg
    性能工具-mem
    性能工具-io工具
    linux后台开发常用调试工具
    GDB的原理
    可变参数以及stdcall
    linux 中断softirq tasklet
    linux kernel RCU 以及读写锁
  • 原文地址:https://www.cnblogs.com/tangxin-blog/p/5615579.html
Copyright © 2011-2022 走看看