Aho和Corasick对KMP算法(Knuth–Morris–Pratt algorithm)进行了改进,使其能够同时匹配多个关键字。Aho-Corasick算法(Aho-Corasick algorithm)先用全部关键字构建一棵带失败函数(fail function)的关键字树,再对文本做一次从左到右的扫描,搜索的总时间复杂度与文本长度n呈线性关系,即O(n)(另加输出匹配结果的开销)。原理图如下(摘自《Aho-Corasick string matching in C#》):
Building of the keyword tree (figure 1 - after the first step, figure 2 - tree with the fail function)
C#版本的实现代码可以从Aho-Corasick string matching in C#得到,也可以点击这里获得该算法的PDF文档。
这是一个应用示例:
该示例能将载入的RTF文档中的搜索关键字高亮显示,检索速度较快;示例没有实现全字匹配。算法的主要代码如下:
- /* Aho-Corasick text search algorithm implementation
- *
- * For more information visit
- * - http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
- */
- using System;
- using System.Collections;
- namespace EeekSoft.Text
- {
/// <summary>
/// Contract that every keyword search algorithm has to fulfil:
/// configuration (keywords, case handling) plus three query methods.
/// </summary>
public interface IStringSearchAlgorithm
{
    // ---- configuration ----

    /// <summary>
    /// Gets or sets the list of keywords to search for.
    /// </summary>
    string[] Keywords { get; set; }

    /// <summary>
    /// Gets or sets whether the case of letters is ignored.
    /// </summary>
    bool IgnoreCase { get; set; }

    // ---- queries ----

    /// <summary>
    /// Tells whether the passed text contains any keyword.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>True when text contains any keyword</returns>
    bool ContainsAny(string text);

    /// <summary>
    /// Searches the passed text for the first occurrence of any keyword.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>
    /// First occurrence of any keyword, or StringSearchResult.Empty
    /// if text doesn't contain any keyword
    /// </returns>
    StringSearchResult FindFirst(string text);

    /// <summary>
    /// Searches the passed text for all occurrences of any keyword.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>Array of occurrences</returns>
    StringSearchResult[] FindAll(string text);
}
/// <summary>
/// Value describing a single search hit: which keyword was found
/// and at which index of the original text.
/// </summary>
public struct StringSearchResult
{
    /// <summary>
    /// Creates a search result for the given index and keyword.
    /// </summary>
    /// <param name="index">Index in text</param>
    /// <param name="keyword">Found keyword</param>
    public StringSearchResult(int index, string keyword)
        : this() // auto-property backing fields must be initialised before they are assigned in a struct
    {
        Index = index;
        Keyword = keyword;
    }

    /// <summary>
    /// Index of the found keyword in the original text.
    /// </summary>
    public int Index { get; private set; }

    /// <summary>
    /// Keyword found by this result.
    /// </summary>
    public string Keyword { get; private set; }

    /// <summary>
    /// An empty search result: index -1 and an empty keyword.
    /// </summary>
    public static StringSearchResult Empty
    {
        get
        {
            StringSearchResult none = new StringSearchResult(-1, "");
            return none;
        }
    }
}
/// <summary>
/// Searches a string for one or multiple keywords using the Aho-Corasick
/// algorithm: the keywords are compiled into a keyword tree with failure
/// links, and a text is then scanned in a single left-to-right pass.
/// </summary>
/// <remarks>
/// The keyword tree is rebuilt whenever <see cref="Keywords"/> or
/// <see cref="IgnoreCase"/> changes, so both setters are comparatively slow.
/// Null or empty keywords are skipped when the tree is built.
/// </remarks>
public class StringSearch : IStringSearchAlgorithm
{
    #region Objects

    /// <summary>
    /// Node of the keyword tree. Stores the character leading to the node,
    /// its child transitions, its failure link and the keywords that are
    /// recognised when the node is reached.
    /// </summary>
    private sealed class TreeNode
    {
        private readonly char _char;
        private readonly TreeNode _parent;
        private readonly ArrayList _results;
        private readonly Hashtable _transHash;
        private TreeNode _failure;
        private TreeNode[] _transitionsAr;
        private string[] _resultsAr;

        /// <summary>
        /// Initialize tree node with specified character
        /// </summary>
        /// <param name="parent">Parent node (null for the root)</param>
        /// <param name="c">Character</param>
        public TreeNode(TreeNode parent, char c)
        {
            _char = c;
            _parent = parent;
            _results = new ArrayList();
            _resultsAr = new string[] { };
            _transitionsAr = new TreeNode[] { };
            _transHash = new Hashtable();
        }

        /// <summary>
        /// Maps a character to the key under which its transition is stored.
        /// The invariant culture is used so that a tree built under one
        /// thread culture still matches lookups made under another.
        /// </summary>
        private static char ToKey(char c, bool ignoreCase)
        {
            return ignoreCase ? char.ToLowerInvariant(c) : c;
        }

        /// <summary>
        /// Adds a pattern ending in this node (duplicates are ignored)
        /// </summary>
        /// <param name="result">Pattern</param>
        public void AddResult(string result)
        {
            if (_results.Contains(result))
            {
                return;
            }
            _results.Add(result);
            // Keep an array snapshot so the search loop can iterate without casts.
            _resultsAr = (string[])_results.ToArray(typeof(string));
        }

        /// <summary>
        /// Adds a transition (child) node
        /// </summary>
        /// <param name="node">Node</param>
        /// <param name="ignoreCase">Ignore case of letters</param>
        public void AddTransition(TreeNode node, bool ignoreCase)
        {
            _transHash.Add(ToKey(node.Char, ignoreCase), node);
            TreeNode[] ar = new TreeNode[_transHash.Count];
            _transHash.Values.CopyTo(ar, 0);
            _transitionsAr = ar;
        }

        /// <summary>
        /// Returns transition to specified character (if exists)
        /// </summary>
        /// <param name="c">Character</param>
        /// <param name="ignoreCase">Ignore case of letters</param>
        /// <returns>Returns TreeNode or null</returns>
        public TreeNode GetTransition(char c, bool ignoreCase)
        {
            return (TreeNode)_transHash[ToKey(c, ignoreCase)];
        }

        /// <summary>
        /// Character
        /// </summary>
        public char Char
        {
            get { return _char; }
        }

        /// <summary>
        /// Parent tree node
        /// </summary>
        public TreeNode Parent
        {
            get { return _parent; }
        }

        /// <summary>
        /// Failure function - node to continue from when no transition matches
        /// </summary>
        public TreeNode Failure
        {
            get { return _failure; }
            set { _failure = value; }
        }

        /// <summary>
        /// Transition function - list of child nodes
        /// </summary>
        public TreeNode[] Transitions
        {
            get { return _transitionsAr; }
        }

        /// <summary>
        /// Returns list of patterns recognised at this node
        /// </summary>
        public string[] Results
        {
            get { return _resultsAr; }
        }
    }

    #endregion

    #region Local fields

    /// <summary>
    /// Root of keyword tree (never null once a constructor has run)
    /// </summary>
    private TreeNode _root;

    /// <summary>
    /// Keywords to search for
    /// </summary>
    private string[] _keywords;

    /// <summary>
    /// Case handling the current tree was built with
    /// </summary>
    private bool _ignoreCase;

    #endregion

    #region Initialization

    /// <summary>
    /// Initialize search algorithm (Build keyword tree)
    /// </summary>
    /// <param name="keywords">Keywords to search for (null is treated as no keywords)</param>
    /// <param name="ignoreCase">Ignore case of letters (the default is false)</param>
    public StringSearch(string[] keywords, bool ignoreCase)
    {
        // The flag must be in place BEFORE the tree is built: transition
        // keys are normalised at build time and again at lookup time, and
        // both have to use the same setting.
        _ignoreCase = ignoreCase;
        Keywords = keywords;
    }

    /// <summary>
    /// Initialize case-sensitive search algorithm (Build keyword tree)
    /// </summary>
    /// <param name="keywords">Keywords to search for (null is treated as no keywords)</param>
    public StringSearch(string[] keywords)
        : this(keywords, false)
    { }

    /// <summary>
    /// Initialize search algorithm with no keywords
    /// (Use Keywords property)
    /// </summary>
    public StringSearch()
        : this(null, false)
    { }

    #endregion

    #region Implementation

    /// <summary>
    /// Build tree from the current keywords and case setting
    /// </summary>
    void BuildTree()
    {
        _root = new TreeNode(null, ' ');
        if (_keywords != null)
        {
            foreach (string keyword in _keywords)
            {
                // A null keyword cannot be enumerated and an empty one would
                // register a result on the root node, so both are skipped.
                if (string.IsNullOrEmpty(keyword))
                {
                    continue;
                }
                AddKeyword(keyword);
            }
        }
        LinkFailures();
    }

    /// <summary>
    /// Inserts one keyword into the tree, creating missing nodes on the way
    /// </summary>
    private void AddKeyword(string keyword)
    {
        TreeNode node = _root;
        foreach (char c in keyword)
        {
            TreeNode next = node.GetTransition(c, _ignoreCase);
            if (next == null)
            {
                next = new TreeNode(node, c);
                node.AddTransition(next, _ignoreCase);
            }
            node = next;
        }
        node.AddResult(keyword);
    }

    /// <summary>
    /// Computes the failure link of every node in breadth-first order and
    /// copies the results of the failure target onto each node
    /// </summary>
    private void LinkFailures()
    {
        Queue pending = new Queue();

        // level 1 nodes - fail to root node
        foreach (TreeNode first in _root.Transitions)
        {
            first.Failure = _root;
            foreach (TreeNode child in first.Transitions)
            {
                pending.Enqueue(child);
            }
        }

        // other nodes - breadth first, so the failure link (and results)
        // of every shallower node is final before it is consulted
        while (pending.Count != 0)
        {
            TreeNode node = (TreeNode)pending.Dequeue();

            // Walk the parent's failure chain until some node has a
            // transition for this node's character. The root's own failure
            // link is still null here, which terminates the walk.
            TreeNode fallback = node.Parent.Failure;
            TreeNode target = null;
            while (fallback != null)
            {
                target = fallback.GetTransition(node.Char, _ignoreCase);
                if (target != null)
                {
                    break;
                }
                fallback = fallback.Failure;
            }

            if (target == null)
            {
                node.Failure = _root;
            }
            else
            {
                node.Failure = target;
                foreach (string result in target.Results)
                {
                    node.AddResult(result);
                }
            }

            foreach (TreeNode child in node.Transitions)
            {
                pending.Enqueue(child);
            }
        }

        _root.Failure = _root;
    }

    /// <summary>
    /// Advances the automaton by one character: follows failure links until
    /// a transition for <paramref name="c"/> exists or the root is reached
    /// </summary>
    /// <param name="state">Current node</param>
    /// <param name="c">Next character of the text</param>
    /// <returns>Node reached after consuming the character</returns>
    private TreeNode NextState(TreeNode state, char c)
    {
        while (true)
        {
            TreeNode next = state.GetTransition(c, _ignoreCase);
            if (next != null)
            {
                return next;
            }
            if (state == _root)
            {
                // Nothing matches even from the root: stay there.
                return _root;
            }
            state = state.Failure;
        }
    }

    #endregion

    #region Methods & Properties

    /// <summary>
    /// Ignore case of letters (changing this value is slow, because
    /// it requires rebuilding of keyword tree)
    /// </summary>
    public bool IgnoreCase
    {
        get { return _ignoreCase; }
        set
        {
            if (_ignoreCase == value)
            {
                return;
            }
            _ignoreCase = value;
            // Transition keys depend on the case setting, so the existing
            // tree cannot be reused.
            BuildTree();
        }
    }

    /// <summary>
    /// Keywords to search for (setting this property is slow, because
    /// it requires rebuilding of keyword tree)
    /// </summary>
    public string[] Keywords
    {
        get { return _keywords; }
        set
        {
            _keywords = value;
            BuildTree();
        }
    }

    /// <summary>
    /// Searches passed text and returns all occurrences of any keyword
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>Array of occurrences</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
    public StringSearchResult[] FindAll(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        ArrayList ret = new ArrayList();
        TreeNode ptr = _root;
        for (int index = 0; index < text.Length; index++)
        {
            ptr = NextState(ptr, text[index]);
            foreach (string found in ptr.Results)
            {
                // index is the position of the keyword's last character
                ret.Add(new StringSearchResult(index - found.Length + 1, found));
            }
        }
        return (StringSearchResult[])ret.ToArray(typeof(StringSearchResult));
    }

    /// <summary>
    /// Searches passed text and returns first occurrence of any keyword.
    /// The text is scanned left to right and the scan stops at the first
    /// position where a keyword ends; if several keywords end there, the
    /// first one recorded on that tree node is returned.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
    public StringSearchResult FindFirst(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        TreeNode ptr = _root;
        for (int index = 0; index < text.Length; index++)
        {
            ptr = NextState(ptr, text[index]);
            string[] results = ptr.Results;
            if (results.Length > 0)
            {
                string found = results[0];
                return new StringSearchResult(index - found.Length + 1, found);
            }
        }
        return StringSearchResult.Empty;
    }

    /// <summary>
    /// Searches passed text and returns true if text contains any keyword
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>True when text contains any keyword</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
    public bool ContainsAny(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        TreeNode ptr = _root;
        for (int index = 0; index < text.Length; index++)
        {
            ptr = NextState(ptr, text[index]);
            if (ptr.Results.Length > 0)
            {
                return true;
            }
        }
        return false;
    }

    #endregion
}
- }