具体思路:
1->敏感词库,可从数据库读取,也可以从文件加载.
3->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节、汉字两个字节的格式编码,便于切分文字.
3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.
4->检测一段文字内容时,也事先转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.
代码.h
#ifndef SENSITIVE_WORDS_CHECKER_
#define SENSITIVE_WORDS_CHECKER_

#include <stddef.h>   // size_t, NULL
#include <stdint.h>
#include <stdio.h>
#include <string.h>   // memset (standard home of the function)
#include <memory.h>   // kept for translation units that still rely on it
#include <map>
#include <vector>

// Size limits shared by the word loader and the checker.
enum {
    enmMaxWordLength = 32,               // size of the buffer that holds one sensitive word
    enmMaxWordsFileLength = 1024 * 128,  // maximum sensitive-word file length: 128K
    enmMaxContentLength = 1024           // maximum length of the content passed to one check
};

// One sensitive word stored in a fixed-size, zero-initialised buffer.
struct SensitiveWord
{
    char szWord[enmMaxWordLength];

    SensitiveWord()
    {
        memset(szWord, 0, sizeof(szWord));
    }
};

// All words that share the same first-character code.
typedef std::vector<SensitiveWord*> WordList;
// Index from first-character code to the words starting with that character.
typedef std::map<uint32_t, WordList*> WordMap;

// Loads a sensitive-word list and indexes it by the integer code of each
// word's first character.
//
// The object owns arrSensitiveWord and releases it with delete[] in the
// destructor, so copying is disabled: an implicit member-wise copy would make
// two objects free the same array.
//
// NOTE(review): the destructor releases only arrSensitiveWord. If the
// WordList objects stored in mapWords are heap-allocated by BuildWordMap
// (its definition is not visible here), they need to be released as well.
class SensitiveWordsChecker
{
public:
    SensitiveWordsChecker() : arrSensitiveWord(NULL), nSensitiveWordCnt(0) {}
    ~SensitiveWordsChecker() { delete[] arrSensitiveWord; }

public:
    // Loads the word list from a UTF-8 encoded file.
    void LoadWordsFromUTF8File(const char *file_name);
    // Loads the word list from a GBK encoded file.
    void LoadWordsFromGBKFile(const char *file_name);

protected:
    int32_t WriteToFile(const char buf[], const int32_t buf_size, const char *file_name);
    void DumpWordMap();
    void GenTestData();
    void Test();
    // printf-style append into buf starting at offset (presumably advances
    // offset by the number of characters written; confirm in the definition).
    void StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...);

private:
    int32_t LoadFile(char buf[], const uint32_t buf_size, const char *file_name);
    int32_t CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    int32_t UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    int32_t GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    uint32_t GetWordsCount(char buf[], const uint32_t buf_size, char separator);
    char *StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list);
    int32_t GetWords(char gbk_buf[], const uint32_t buf_size, char separator);
    void BuildWordMap();
    uint32_t GetFirstCharFromGBK(char gbk_buf[]);
    uint32_t GetFirstCharFromTUF8(char utf8_buf[]);
    uint32_t GetFirstChar(char buf[]);
    // Returns 0 when in_utf8_buf contains no sensitive word.
    // Returns 1 when in_utf8_buf contains a sensitive word; the word is
    // replaced with '*' and the result is written to out_utf8_buf.
    int32_t CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]);
    const SensitiveWord* FindSensitiveWord(uint32_t code, const char *pos);

private:
    // Copying is disabled (declared, intentionally never defined) because the
    // destructor frees arrSensitiveWord; a copy would cause a double delete[].
    SensitiveWordsChecker(const SensitiveWordsChecker &);
    SensitiveWordsChecker &operator=(const SensitiveWordsChecker &);

private:
    SensitiveWord *arrSensitiveWord;   // owned array, released with delete[] in the destructor
    uint32_t nSensitiveWordCnt;        // word count; presumably the length of arrSensitiveWord
    WordMap mapWords;                  // first-character code -> words starting with it
};

#endif
.cpp
1 #include "SenditiveWordsChecker.h" 2 #include "stdio.h" 3 #include "string.h" 4 #include "iconv.h" 5 #include <stdarg.h> 6 #include <new> 7 8 void SensitiveWordsChecker::LoadWordsFromUTF8File(const char *file_name) 9 { 10 char utf8_buf[enmMaxWordsFileLength] , gbk_buf[enmMaxWordsFileLength]; 11 LoadFile(utf8_buf, enmMaxWordsFileLength, file_name); 12 UTF8_To_GBK(utf8_buf, strlen(utf8_buf), gbk_buf, enmMaxWordsFileLength); 13 GetWords(gbk_buf, enmMaxWordsFileLength, ','); 14 } 15 16 void SensitiveWordsChecker::LoadWordsFromGBKFile(const char *file_name) 17 { 18 char gbk_buf[enmMaxWordsFileLength]; 19 LoadFile(gbk_buf, enmMaxWordsFileLength, file_name); 20 GetWords(gbk_buf, enmMaxWordsFileLength,','); 21 } 22 23 int32_t SensitiveWordsChecker::LoadFile(char buf[], const uint32_t buf_size, const char *file_name) 24 { 25 FILE * pFile; 26 size_t lSize = 0, result = 0; 27 fopen_s(&pFile, file_name, "rb"); 28 if (pFile == NULL) { fputs("File error ", stderr); return -1; } 29 // obtain file size: 30 fseek(pFile, 0, SEEK_END); 31 lSize = ftell(pFile); 32 rewind(pFile); 33 if (lSize >= buf_size){ fputs("file too large ", stderr); return -1; } 34 result = fread(buf, 1, lSize, pFile); 35 if (result != lSize) { fputs("Reading error ", stderr); return -1; } 36 buf[lSize] = '