zoukankan      html  css  js  c++  java
  • c#-SimHash匹配相似-算法

    使用场景:Google 的 simhash 算法

     //经过大量测试,simhash 适合比较较长的文本(例如 500 字以上),效果相当好:海明距离小于 3 的文本基本都是相似的,误判率也比较低。
    
     //根据我的经验,假设 N 是每个分块的大小,M 是相邻分块之间重叠的字符数,那么 N = 4、M = 3 是最好的选择。
    

      

        /// <summary>
        /// Scores how alike two strings are by comparing 32-bit SimHash fingerprints.
        /// </summary>
        /// <remarks>
        /// Each input is split into 4-character chunks that overlap by 3 characters,
        /// every chunk is hashed with <see cref="string.GetHashCode()"/>, and a per-bit
        /// majority vote over all chunk hashes produces the fingerprint. The likeness is
        /// the fraction of fingerprint bits on which the two inputs agree.
        /// </remarks>
        public class SimHashAnalyser : IAnalyser
        {
            /// <summary>Number of bits in a fingerprint (the width of an <c>int</c>).</summary>
            private const int HashSize = 32;

            // Chunking used for every fingerprint: 4-character chunks overlapping by 3.
            private readonly ITokeniser tokeniser = new OverlappingStringTokeniser(4, 3);

            /// <summary>
            /// Computes the likeness of two strings.
            /// </summary>
            /// <param name="needle">First text to compare.</param>
            /// <param name="haystack">Second text to compare.</param>
            /// <returns>
            /// A value from 0 to 1: <c>(32 - hammingDistance) / 32</c>, so 1 means the two
            /// fingerprints are identical and 0 means every bit differs.
            /// </returns>
            /// <exception cref="ArgumentNullException">
            /// <paramref name="needle"/> or <paramref name="haystack"/> is null.
            /// </exception>
            public float GetLikenessValue(string needle, string haystack)
            {
                if (needle == null)
                {
                    throw new ArgumentNullException("needle");
                }

                if (haystack == null)
                {
                    throw new ArgumentNullException("haystack");
                }

                var needleSimHash = this.DoCalculateSimHash(needle);
                var hayStackSimHash = this.DoCalculateSimHash(haystack);
                var distance = GetHammingDistance(needleSimHash, hayStackSimHash);
                return (HashSize - distance) / (float)HashSize;
            }

            /// <summary>Hashes every token to a 32-bit value.</summary>
            private static IEnumerable<int> DoHashTokens(IEnumerable<string> tokens)
            {
                // NOTE: string.GetHashCode() is not guaranteed to be stable across processes
                // or runtime versions, so fingerprints derived from it must not be persisted.
                // That is acceptable here because both fingerprints being compared are
                // computed within the same GetLikenessValue call.
                var hashedTokens = new List<int>();
                foreach (var token in tokens)
                {
                    hashedTokens.Add(token.GetHashCode());
                }

                return hashedTokens;
            }

            /// <summary>Counts the bit positions at which the two values differ.</summary>
            private static int GetHammingDistance(int firstValue, int secondValue)
            {
                // Work on the unsigned bit pattern so the loop below terminates for
                // values whose top bit is set.
                var differingBits = unchecked((uint)(firstValue ^ secondValue));
                var distance = 0;
                while (differingBits != 0)
                {
                    differingBits &= differingBits - 1; // clears the lowest set bit
                    distance++;
                }

                return distance;
            }

            /// <summary>Returns true when bit <paramref name="pos"/> (0 = lowest) of <paramref name="b"/> is set.</summary>
            private static bool IsBitSet(int b, int pos)
            {
                return (b & (1 << pos)) != 0;
            }

            /// <summary>
            /// Builds the SimHash fingerprint of <paramref name="input"/>.
            /// </summary>
            /// <remarks>
            /// An input that yields no tokens produces the fingerprint 0, so two such
            /// inputs compare as fully alike.
            /// </remarks>
            private int DoCalculateSimHash(string input)
            {
                // One signed counter per fingerprint bit; a new int[] is already zero-filled.
                var votes = new int[HashSize];

                foreach (var tokenHash in DoHashTokens(this.tokeniser.Tokenise(input)))
                {
                    for (var bit = 0; bit < HashSize; bit++)
                    {
                        // Each token votes +1 for a set bit and -1 for a clear bit.
                        votes[bit] += IsBitSet(tokenHash, bit) ? 1 : -1;
                    }
                }

                var fingerprint = 0;
                for (var bit = 0; bit < HashSize; bit++)
                {
                    // A bit is set in the fingerprint only on a strict majority of +1 votes.
                    if (votes[bit] > 0)
                    {
                        fingerprint |= 1 << bit;
                    }
                }

                return fingerprint;
            }
        }
    
    
    
        /// <summary>
        /// Contract for components that score how alike two strings are.
        /// </summary>
        public interface IAnalyser
        {
            /// <summary>
            /// Computes a likeness score for the two given strings.
            /// </summary>
            /// <param name="needle">First text to compare.</param>
            /// <param name="haystack">Second text to compare.</param>
            /// <returns>
            /// The likeness score. The SimHashAnalyser implementation in this file returns
            /// the fraction of matching fingerprint bits, i.e. a value from 0 to 1.
            /// </returns>
            float GetLikenessValue(string needle, string haystack);
        }
    
        /// <summary>
        /// Contract for components that split a string into a sequence of string tokens.
        /// </summary>
        public interface ITokeniser
        {
            /// <summary>
            /// Splits <paramref name="input"/> into tokens.
            /// </summary>
            /// <param name="input">The text to split.</param>
            /// <returns>The tokens produced from <paramref name="input"/>.</returns>
            IEnumerable<string> Tokenise(string input);
        }
    
        /// <summary>
        /// Splits a string into consecutive, non-overlapping chunks of a fixed size.
        /// </summary>
        public class FixedSizeStringTokeniser : ITokeniser
        {
            // Number of characters per chunk; always assigned by the constructor.
            private readonly ushort tokensize;

            /// <summary>
            /// Creates a tokeniser that produces chunks of <paramref name="tokenSize"/> characters.
            /// </summary>
            /// <param name="tokenSize">Chunk length in characters; must be between 2 and 127 inclusive.</param>
            /// <exception cref="ArgumentException"><paramref name="tokenSize"/> is outside 2..127.</exception>
            public FixedSizeStringTokeniser(ushort tokenSize)
            {
                if (tokenSize < 2 || tokenSize > 127)
                {
                    throw new ArgumentException("Token 不能超出范围");
                }
                this.tokensize = tokenSize;
            }

            /// <summary>
            /// Splits <paramref name="input"/> into chunks of the configured size, in order.
            /// </summary>
            /// <param name="input">The text to split.</param>
            /// <returns>
            /// The chunks in input order. The final chunk is shorter than the configured size
            /// when the input length is not a multiple of it; an empty input yields no chunks.
            /// </returns>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public IEnumerable<string> Tokenise(string input)
            {
                if (input == null)
                {
                    throw new ArgumentNullException("input");
                }

                // Pre-size the list to ceil(length / tokensize).
                var chunks = new List<string>((input.Length + this.tokensize - 1) / this.tokensize);
                for (int offset = 0; offset < input.Length; offset += this.tokensize)
                {
                    // Substring copies only the chunk itself; the last chunk may be short.
                    int length = Math.Min(this.tokensize, input.Length - offset);
                    chunks.Add(input.Substring(offset, length));
                }

                return chunks;
            }
        }
    
    
        /// <summary>
        /// Splits a string into fixed-size chunks where each chunk overlaps the previous
        /// one by a configured number of characters.
        /// </summary>
        public class OverlappingStringTokeniser : ITokeniser
        {
            // Characters per chunk, and characters shared between consecutive chunks.
            private readonly ushort chunkSize = 4;
            private readonly ushort overlapSize = 3;

            /// <summary>
            /// Creates a tokeniser with the given chunk and overlap sizes.
            /// </summary>
            /// <param name="chunkSize">Length of every chunk, in characters.</param>
            /// <param name="overlapSize">Number of characters shared by consecutive chunks.</param>
            /// <exception cref="ArgumentException">
            /// <paramref name="chunkSize"/> is not greater than <paramref name="overlapSize"/>.
            /// </exception>
            public OverlappingStringTokeniser(ushort chunkSize, ushort overlapSize)
            {
                // chunkSize > overlapSize guarantees the window advances by at least one character.
                if (chunkSize <= overlapSize)
                {
                    throw new ArgumentException("Chunck 必须大于 overlap");
                }
                this.overlapSize = overlapSize;
                this.chunkSize = chunkSize;
            }

            /// <summary>
            /// Slides a window of the configured chunk size over <paramref name="input"/>,
            /// advancing by (chunk size - overlap size) characters each step.
            /// </summary>
            /// <param name="input">The text to split.</param>
            /// <returns>
            /// Every full-length chunk, in order. An input shorter than the chunk size
            /// yields no chunks.
            /// </returns>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public IEnumerable<string> Tokenise(string input)
            {
                if (input == null)
                {
                    throw new ArgumentNullException("input");
                }

                var result = new List<string>();
                int step = this.chunkSize - this.overlapSize;

                // The last valid start index is Length - chunkSize, so the bound is inclusive;
                // a strict '<' would drop the final chunk.
                int lastStart = input.Length - this.chunkSize;
                for (int position = 0; position <= lastStart; position += step)
                {
                    result.Add(input.Substring(position, this.chunkSize));
                }

                return result;
            }
        }
    

      

    使用:

        // Two sample texts to compare.
        const string Needle = "中国香港 2013………………";
        const string HayStack = "中国香港………………";

        // Score the pair and convert the result to a percentage.
        var similarity = new SimHashAnalyser().GetLikenessValue(Needle, HayStack);
        float percentage = similarity * 100;

        // Show the result on a cleared console and wait for a key press.
        Console.Clear();
        Console.WriteLine("Likeness: {0}%", percentage);
        Console.ReadKey();
    

      

     SimHash for c#

  • 相关阅读:
    Unity的动态加载简单使用
    枚举的使用总结
    IIS WEB程序如何访问共享目录
    AngularJS之页面跳转Route
    ASP.NET MVC4 BundleConfig的注意事项
    Android自动化测试------monkey自定义脚本(四)
    Android自动化测试------monkey(三)
    Android自动化测试------monkey(二)
    Android自动化测试------monkey(一)
    (一)adb命令的使用
  • 原文地址:https://www.cnblogs.com/zengxiangzhan/p/3311114.html
Copyright © 2011-2022 走看看