zoukankan      html  css  js  c++  java
  • DFA和trie特里实现敏感词过滤(python和c语言)

    今天的项目是用python开发的,需要做关键词检查、过滤分类。以前是用c语言做这种事情的:用了字典树(trie),非常高效,内存占用小,检查速度快。

    到了python这边,第一个想法是用pip找一个基于c语言实现的python字典树(trie)模块。不幸的是,没有找到合适的。假如我会用c语言编写python模块的话,就自己写一个了,可惜我还不具备这个能力。

    只能用python写了,性能差一点就差一点吧,内存多一点也无所谓了。


    用搜索引擎看了CSDN上网友用python实现的DFA,再参照自己曾经用c语言写过的字典树,觉得有些地方不大对,就自己写了一个。可以想见,如果用C语言实现会很高效,并且占用的空间也特别小。

     某位网友的:DFA 算法实现敏感词过滤(python 实现)


    以下是python代码:

    class cNode(object):
        """A single trie node.

        ``children`` is None for a node without children; otherwise it is a
        dict mapping the next element of a keyword to a tuple
        ``(child_node, is_word_end)``.
        """

        def __init__(self):
            # The child table is created on demand by whoever adds a child.
            self.children = None
    		
    # Keywords (words) are expected to be UTF-8 encoded
    # Messages are expected to be UTF-8 encoded
    class cDfa(object):
        """Keyword filter backed by a trie (prefix tree) of ``cNode`` objects.

        Every ``children`` dict maps one element of a keyword to a tuple
        ``(next_node, is_word_end)``; ``is_word_end`` is True when some keyword
        ends on that element.  Keywords and messages must be the same kind of
        string (both text, or on Python 2 both byte strings) so that their
        elements compare equal.
        """

        def __init__(self, lWords):
            """Build the trie from ``lWords``, an iterable of keywords."""
            self.root = cNode()
            for sWord in lWords:
                self.addWord(sWord)

        def addWord(self, word):
            """Insert ``word`` into the trie; an empty word adds nothing."""
            node = self.root
            iEnd = len(word) - 1
            for i, ch in enumerate(word):
                if node.children is None:
                    # Child tables are created lazily, only when needed.
                    node.children = {}
                entry = node.children.get(ch)
                if entry is None:
                    entry = (cNode(), i == iEnd)
                    node.children[ch] = entry
                elif i == iEnd and not entry[1]:
                    # An existing prefix now also ends a keyword of its own.
                    entry = (entry[0], True)
                    node.children[ch] = entry
                node = entry[0]

        def _matchEnd(self, sMsg, iStart):
            """Return the index of the last element of the shortest keyword
            that starts at ``iStart`` in ``sMsg``, or -1 if none starts there.
            """
            p = self.root
            j = iStart
            iLen = len(sMsg)
            while j < iLen and p.children is not None and sMsg[j] in p.children:
                p, bWord = p.children[sMsg[j]]
                if bWord:
                    return j
                j += 1
            return -1

        def isContain(self, sMsg):
            """Return True if any keyword occurs anywhere in ``sMsg``."""
            # range() instead of xrange() so the class also runs on Python 3.
            for i in range(len(sMsg)):
                if self._matchEnd(sMsg, i) >= 0:
                    return True
            return False

        def filter(self, sMsg):
            """Return ``sMsg`` with every matched keyword masked by ``*``.

            Scanning goes left to right.  At each position the shortest keyword
            starting there is replaced by one ``*`` per element and scanning
            resumes right after it; unmatched elements are copied unchanged.
            """
            lNew = []
            iLen = len(sMsg)
            i = 0
            while i < iLen:
                j = self._matchEnd(sMsg, i)
                if j < 0:
                    lNew.append(sMsg[i])
                    i += 1
                else:
                    # A plain '*' (not u'*') keeps the pieces joinable with
                    # Python 2 byte strings that contain non-ASCII bytes.
                    lNew.append('*' * (j - i + 1))
                    i = j + 1
            return ''.join(lNew)
    
    


    以下是c语言代码trie_tree.h:

    #ifndef _TRIE_TREE_H_INCLUDED_
    #define _TRIE_TREE_H_INCLUDED_
    
    
    /* Number of child slots per node; children are indexed by an unsigned char. */
    #define WORD_NUM          256
    /* One node of the byte-indexed trie. */
    struct trie_node {
    	struct trie_node *node[WORD_NUM];	/* child for each next byte, NULL if absent */
    	int value;	/* value handed to create_trie_node() for this node */
    	int exist;	/* set to 1 when an inserted word ends at this node */
    };
    
    /* Allocate a zero-initialised node that stores `value`. */
    struct trie_node *create_trie_node(int value);
    /* Insert the NUL-terminated byte string `word` below `root`. */
    void trie_tree_insert_word(struct trie_node *root, unsigned char *word);
    /* Return 1 if any stored word occurs somewhere inside `word`, otherwise 0. */
    int tire_word_is_exist(struct trie_node *root, unsigned char *word);
    /* Free `root` and every node below it; a NULL root is accepted. */
    void destroy_trie_tree(struct trie_node *root);
    /*
     * The implementation is spelled destroy_trie_tree; the header used to
     * declare the misspelled destory_trie_tree, which had no definition.
     * Keep the old spelling usable for existing callers.
     */
    #define destory_trie_tree destroy_trie_tree
    /* Replace *root with a tree built from the words listed in `filename`. */
    void update_trie_tree(struct trie_node **root, const char *filename);
    
    #endif
    

    trie_tree.c:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <trie_tree.h>
    
    
    
    
    /*
     * Allocate a trie node with all child slots NULL and `exist` cleared,
     * and store `value` in it.
     *
     * Returns the new node, or NULL when the allocation fails.
     */
    struct trie_node *create_trie_node(int value)
    {
        struct trie_node *node = calloc(1, sizeof(*node));

        /* calloc may fail; never write through a NULL pointer. */
        if (node == NULL) {
            return NULL;
        }
        node->value = value;
        return node;
    }
    
    /*
     * Report whether any word stored in the trie occurs inside `word`.
     *
     * Every position of `word` is tried as a starting point; from there the
     * trie is followed byte by byte until a node marked as a word end is
     * reached (match) or no child exists for the next byte (try next start).
     *
     * Returns 1 on a match, 0 otherwise (also when `root` is NULL).
     */
    int tire_word_is_exist(struct trie_node *root, unsigned char *word)
    {
        unsigned char *start;
        unsigned char *cur;

        if (root == NULL) {
            return 0;
        }

        for (start = word; *start != 0; start++) {
            struct trie_node *node = root;

            for (cur = start; *cur != 0; cur++) {
                node = node->node[*cur];
                if (node == NULL) {
                    break;
                }
                if (node->exist == 1) {
                    return 1;
                }
            }
        }

        return 0;
    }
    
    /*
     * Insert the NUL-terminated byte string `word` into the trie at `root`,
     * creating missing nodes on the way and marking the final node as a
     * word end.
     *
     * Nothing is done when `root` or `word` is NULL. If a node cannot be
     * allocated the insertion stops; nodes already created stay in the tree
     * but the word is not marked as present.
     */
    void trie_tree_insert_word(struct trie_node *root, unsigned char *word)
    {
        struct trie_node *child;

        if (root == NULL || word == NULL) {
            return;
        }

        for (; *word != 0; word++) {
            child = root->node[*word];
            if (child == NULL) {
                child = create_trie_node(*word);
                if (child == NULL) {
                    /* Out of memory: do not descend through a NULL node. */
                    return;
                }
                root->node[*word] = child;
            }
            root = child;
        }
        root->exist = 1;
    }
    
    /*
     * Recursively free `root` and all nodes reachable from it.
     * A NULL `root` is a no-op, which also ends the recursion at empty slots.
     */
    void destroy_trie_tree(struct trie_node *root)
    {
        struct trie_node **slot;

        if (root != NULL) {
            /* Release every subtree before the node that points to it. */
            for (slot = root->node; slot != root->node + WORD_NUM; slot++) {
                destroy_trie_tree(*slot);
            }
            free(root);
        }
    }
    
    
    /*
     * Rebuild the trie at *root from the word list in `filename`.
     *
     * The previous tree (if any) is destroyed and replaced by a fresh empty
     * root before the file is opened, so after a failed open *root is an
     * empty tree. Each line is cut at the first newline, carriage return or
     * space, and the remaining text is inserted as one word; lines that are
     * empty after cutting are skipped. Lines longer than the 1024-byte buffer
     * are read by fgets in several pieces.
     *
     * If the root node cannot be allocated, *root is left NULL.
     */
    void update_trie_tree(struct trie_node **root, const char *filename)
    {
        char word[1024];
        FILE *fp;

        if (root == NULL || filename == NULL) {
            return;
        }

        /* destroy_trie_tree ignores a NULL tree, so no separate check. */
        destroy_trie_tree(*root);

        *root = calloc(1, sizeof(**root));
        if (*root == NULL) {
            return;
        }

        fp = fopen(filename, "r");
        if (fp == NULL) {
            printf("file can't open %s\n", filename);
            return;
        }

        while (fgets(word, sizeof(word), fp)) {
            /* Terminate the word at the first line break or space. */
            word[strcspn(word, "\r\n ")] = 0;
            if (word[0] != 0) {
                trie_tree_insert_word(*root, (unsigned char *)word);
            }
        }

        /* The original never closed the file, leaking the handle. */
        fclose(fp);
    }
    		
    



    版权声明:本文为博主原创文章,未经博主同意不得转载。

  • 相关阅读:
    [WARNING] 找不到编译器:wepy-compiler-less。 [Error] 未发现相关 less 编译器配置,请检查wepy.config.js文件。
    wepy-cli : Failed to download repo standard: getaddrinfo ENOENT raw.githubusercontent.com
    火币Huobi 币安 Binance ios 反编译
    iPhone 反编译-微信多开分身版原理,一部iPhone登录多个微信号
    安卓微信分身版 一部手机同时登录多个微信号 原理
    微信smali反编译 查看撤回消息
    微信抢红包
    Dota2 荒神罪 破解
    微信smali反编译 语音消息转发功能
    mac安装mysql流程
  • 原文地址:https://www.cnblogs.com/lcchuguo/p/4881284.html
Copyright © 2011-2022 走看看