1.DFA算法
DFA算法的原理可以参考这里,简单来说就是通过Map构造出一棵敏感词树,树中每一条从根节点到带有结束标记(isEnd 为 true)的节点的路径构成一个敏感词(该节点不一定是叶子节点,因为一个敏感词可能是另一个敏感词的前缀),例如下图:
代码简单实现如下:
public class TextFilterUtil {
// Logger for this utility class.
private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class);
// Root of the sensitive-word tree. Stays null until init() assigns a map to it.
// Keys are Character (child branches) plus the String key "isEnd" (end-of-word flag).
private static HashMap sensitiveWordMap = null;
// Charset name used to decode the sensitive-word file.
private static final String ENCODING = "gbk";
// Stream over the classpath resource that holds the sensitive words.
// It is opened once, when the class is initialized; getResourceAsStream returns null
// if the resource cannot be found. readSensitiveWords() closes the reader wrapping
// this stream, so the content can be consumed only once.
private static final InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream("sensitive/keyWords.txt");
/**
 * Builds the sensitive-word tree.
 *
 * Loads the word list through {@code readSensitiveWords()}, replaces
 * {@code sensitiveWordMap} with a fresh map whose initial capacity is the
 * number of words read, and then inserts each word via {@code createKeyWord}.
 */
private static void init() {
    // Load the words first: the count is used as the root map's initial capacity.
    final Set<String> words = readSensitiveWords();

    // Publish a fresh root map; createKeyWord() writes into this static field.
    sensitiveWordMap = new HashMap<>(words.size());

    for (final String word : words) {
        createKeyWord(word);
    }
}
/**
 * Inserts one sensitive word into the tree rooted at {@code sensitiveWordMap}.
 *
 * Each character of the word becomes a {@code Character} key that maps to a
 * child map. Every node also carries the String key {@code "isEnd"} whose value
 * is the String {@code "true"} when a word terminates at that node and
 * {@code "false"} otherwise.
 *
 * If the tree has not been initialized an error is logged and nothing is
 * inserted. A null or empty word is ignored: without this guard an empty word
 * would mark the root node itself as the end of a word.
 *
 * @param keyWord the sensitive word to insert; null or empty values are skipped
 */
private static void createKeyWord(String keyWord) {
    if (sensitiveWordMap == null) {
        LOG.error("sensitiveWordMap 未初始化!");
        return;
    }
    // An empty word would skip the loop below and flag the root as "isEnd";
    // a null word would throw a NullPointerException.
    if (keyWord == null || keyWord.isEmpty()) {
        return;
    }
    Map nowMap = sensitiveWordMap;
    for (Character c : keyWord.toCharArray()) {
        Object obj = nowMap.get(c);
        if (obj == null) {
            // No branch for this character yet: create it, not (yet) a word end.
            // Keys are both Character (branches) and String ("isEnd"), hence Object.
            Map<Object, Object> childMap = new HashMap<>();
            childMap.put("isEnd", "false");
            nowMap.put(c, childMap);
            nowMap = childMap;
        } else {
            // Branch already exists (shared prefix with an earlier word): descend.
            nowMap = (Map) obj;
        }
    }
    // The node reached after the last character terminates this word.
    nowMap.put("isEnd", "true");
}
/**
 * Reads the sensitive-word file from the class-level stream {@code in}.
 *
 * The stream is decoded with {@code ENCODING} and read line by line. Each line
 * is trimmed, and blank lines are skipped so that no empty word reaches the
 * tree builder. The stream is closed when reading finishes, so a second call
 * cannot read the content again.
 *
 * Failures are logged, not thrown: if the resource is missing or an I/O error
 * occurs, the words collected so far (possibly none) are returned.
 *
 * @return the set of distinct, non-empty, trimmed words; never null
 */
private static Set<String> readSensitiveWords() {
    Set<String> keyWords = new HashSet<>();
    // getResourceAsStream yields null when the resource is absent; wrapping a
    // null stream in a reader would throw an unchecked NullPointerException.
    if (in == null) {
        LOG.error("敏感词库文件不存在!");
        return keyWords;
    }
    // try-with-resources closes the reader (and the underlying stream) on every path.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING))) {
        String line;
        while ((line = reader.readLine()) != null) {
            String word = line.trim();
            if (!word.isEmpty()) {
                keyWords.add(word);
            }
        }
    } catch (UnsupportedEncodingException e) {
        LOG.error("敏感词库文件转码失败!", e);
    } catch (IOException e) {
        LOG.error("敏感词库文件读取失败!", e);
    }
    return keyWords;
}
/**
* 检查敏感词
*
* @return