zoukankan      html  css  js  c++  java
  • 快速求熵程序

    信息熵很有用,就拿我的老本行反病毒来说,它是静态病毒特征的常见组成部分。信息熵的计算公式很简单:

    $ Entropy=-\sum_{i=1}^{n}P(X_{i})\log_{2}P(X_{i}) $

    其中,$ P(X_{i}) $是随机变量$X_{i}$出现的概率,这里熵的单位是bit。通常,我们用$C(X_{i})$表示$X_{i}$出现的次数,$T=\sum_{i=1}^{n}C(X_{i})$表示观察次数的总和,那么:

    $ Entropy=-\sum_{i=1}^{n}\frac{C(X_{i})}{T}\log_{2}\frac{C(X_{i})}{T} $

    根据这个公式,最直接也是最常见的求熵程序类似于:

    double entropy(int *counts, int n, int total)
    {
        double sum = 0;
        double p;
        for(int i = 0; i < n; i++){
            if(counts[i] > 0){
                p = (double) counts[i] / total;
                sum -= p * log(p);
            }
        }
        return sum / log(2.0);
    }

    比如,要计算某段内存数据的熵,一般是这样调用entropy函数的:

    /*
     * Entropy, in bits per byte, of a memory buffer.
     *
     * Builds a 256-bin histogram of the byte values in data[0..size-1] and
     * hands it to entropy() with size as the total number of observations.
     * A non-positive size leaves the histogram all zero.
     */
    double entropy_data(unsigned char *data, int size)
    {
        enum { BYTE_VALUES = 256 };
        int histogram[BYTE_VALUES] = {0};
        unsigned char *cursor = data;
        int remaining;

        for (remaining = size; remaining > 0; --remaining) {
            histogram[*cursor++] += 1;
        }

        return entropy(histogram, BYTE_VALUES, size);
    }

    如果内存数据里的字节是均匀分布的(比如随机字节序列),那么熵值接近于8。反之,某几个字节大量重复出现时,熵值接近于0。在病毒特征中,特定文件数据的熵值是判断文件是否经过压缩加密的重要指标。

    在实际应用中,我们对熵的精度要求并不高,15个有效位的double乃至7个有效位的float都显得有些“浪费”,而浮点运算又使得熵的计算速度不敢恭维。那么,是否可以在牺牲部分精度的前提下,快速计算近似熵值呢?答案是肯定的。经过几天的琢磨,终于写了一个出来。

    在贴出代码之前,先看一下计算100万次熵值的用时对比数据(单位:ms)。测试环境:i7-3520M @2.9GHz, 8GB RAM, Win7 SP1 64-bit。

      x86 编译 x64 编译
    entropy 4575.6 9851.5
    fast_entropy 677.1 405.6

    x86版本的fast_entropy速度大约是entropy的6倍,x64版本的fast_entropy速度是entropy的24倍。结果精度方面,fast_entropy一般可以精确至小数点后3到4位,严格的误差分析还没有做,有时间会补上。

    fast_entropy用了整数运算来替代浮点运算,并应用了IEEE浮点数存储格式的技巧,得以用整数运算近似Log运算。之所以在x64下表现更好,是因为64-bit整数操作在x64上更快。

    好了,上代码!

    /*
     * Correction table for _lxlogx(): 256 entries, selected by the top eight
     * mantissa bits of a float's bit pattern (mask 0x007F8000, shifted right
     * by 15).
     *
     * Every entry has the form 0xc08xxxxx. Interpreted as a 32-bit
     * two's-complement int, 0xc0800000 is -(127 << 23), so adding an entry to
     * the bit pattern removes the exponent bias and adds a small
     * per-mantissa-bucket correction in one step.
     *
     * The initialisers exceed INT_MAX and rely on the implementation-defined
     * unsigned-to-int conversion, which wraps to the intended negative value
     * on the usual two's-complement targets. A 32-bit int is assumed.
     *
     * The table is only ever read, so it is declared const.
     */
    static const int _u[256] = {
        0xc0801c36, 0xc0805429, 0xc0808b65, 0xc080c1eb, 0xc080f7bf, 0xc0812cdf, 0xc081614f, 0xc081950f,
        0xc081c821, 0xc081fa86, 0xc0822c3f, 0xc0825d4d, 0xc0828db3, 0xc082bd71, 0xc082ec88, 0xc0831af9,
        0xc08348c7, 0xc08375f1, 0xc083a27a, 0xc083ce62, 0xc083f9ab, 0xc0842455, 0xc0844e63, 0xc08477d4,
        0xc084a0aa, 0xc084c8e6, 0xc084f08a, 0xc0851796, 0xc0853e0c, 0xc08563ec, 0xc0858937, 0xc085adef,
        0xc085d215, 0xc085f5a9, 0xc08618ad, 0xc0863b21, 0xc0865d07, 0xc0867e60, 0xc0869f2c, 0xc086bf6c,
        0xc086df22, 0xc086fe4e, 0xc0871cf2, 0xc0873b0e, 0xc08758a2, 0xc08775b1, 0xc087923b, 0xc087ae40,
        0xc087c9c2, 0xc087e4c1, 0xc087ff3f, 0xc088193c, 0xc08832b9, 0xc0884bb7, 0xc0886436, 0xc0887c38,
        0xc08893bd, 0xc088aac6, 0xc088c155, 0xc088d768, 0xc088ed03, 0xc0890224, 0xc08916cd, 0xc0892aff,
        0xc0893ebb, 0xc0895200, 0xc08964d1, 0xc089772d, 0xc0898916, 0xc0899a8b, 0xc089ab8e, 0xc089bc20,
        0xc089cc41, 0xc089dbf2, 0xc089eb34, 0xc089fa06, 0xc08a086b, 0xc08a1662, 0xc08a23ec, 0xc08a310a,
        0xc08a3dbc, 0xc08a4a04, 0xc08a55e1, 0xc08a6155, 0xc08a6c60, 0xc08a7702, 0xc08a813d, 0xc08a8b10,
        0xc08a947d, 0xc08a9d84, 0xc08aa626, 0xc08aae62, 0xc08ab63b, 0xc08abdb0, 0xc08ac4c2, 0xc08acb71,
        0xc08ad1be, 0xc08ad7aa, 0xc08add35, 0xc08ae260, 0xc08ae72b, 0xc08aeb97, 0xc08aefa4, 0xc08af353,
        0xc08af6a4, 0xc08af998, 0xc08afc30, 0xc08afe6b, 0xc08b004b, 0xc08b01d0, 0xc08b02fa, 0xc08b03ca,
        0xc08b0440, 0xc08b045e, 0xc08b0422, 0xc08b038f, 0xc08b02a4, 0xc08b0161, 0xc08affc8, 0xc08afdd9,
        0xc08afb94, 0xc08af8f9, 0xc08af609, 0xc08af2c5, 0xc08aef2d, 0xc08aeb42, 0xc08ae703, 0xc08ae271,
        0xc08add8e, 0xc08ad858, 0xc08ad2d1, 0xc08accf9, 0xc08ac6d0, 0xc08ac057, 0xc08ab98e, 0xc08ab276,
        0xc08aab0f, 0xc08aa35a, 0xc08a9b56, 0xc08a9305, 0xc08a8a66, 0xc08a817a, 0xc08a7841, 0xc08a6ebd,
        0xc08a64ec, 0xc08a5ad0, 0xc08a5069, 0xc08a45b8, 0xc08a3abc, 0xc08a2f76, 0xc08a23e6, 0xc08a180d,
        0xc08a0bec, 0xc089ff81, 0xc089f2cf, 0xc089e5d5, 0xc089d893, 0xc089cb0a, 0xc089bd3b, 0xc089af25,
        0xc089a0c8, 0xc0899227, 0xc089833f, 0xc0897413, 0xc08964a2, 0xc08954ec, 0xc08944f2, 0xc08934b5,
        0xc0892434, 0xc089136f, 0xc0890268, 0xc088f11f, 0xc088df93, 0xc088cdc5, 0xc088bbb6, 0xc088a965,
        0xc08896d4, 0xc0888401, 0xc08870ef, 0xc0885d9c, 0xc0884a09, 0xc0883637, 0xc0882226, 0xc0880dd5,
        0xc087f946, 0xc087e479, 0xc087cf6e, 0xc087ba24, 0xc087a49e, 0xc0878eda, 0xc08778d9, 0xc087629b,
        0xc0874c21, 0xc087356b, 0xc0871e78, 0xc087074b, 0xc086efe1, 0xc086d83d, 0xc086c05e, 0xc086a844,
        0xc0868ff0, 0xc0867762, 0xc0865e9a, 0xc0864598, 0xc0862c5e, 0xc08612ea, 0xc085f93d, 0xc085df58,
        0xc085c53a, 0xc085aae4, 0xc0859057, 0xc0857592, 0xc0855a95, 0xc0853f61, 0xc08523f7, 0xc0850855,
        0xc084ec7e, 0xc084d070, 0xc084b42c, 0xc08497b2, 0xc0847b03, 0xc0845e1e, 0xc0844105, 0xc08423b6,
        0xc0840633, 0xc083e87c, 0xc083ca90, 0xc083ac71, 0xc0838e1d, 0xc0836f96, 0xc08350dc, 0xc08331ee,
        0xc08312ce, 0xc082f37b, 0xc082d3f5, 0xc082b43d, 0xc0829453, 0xc0827437, 0xc08253ea, 0xc082336b,
        0xc08212ba, 0xc081f1d9, 0xc081d0c7, 0xc081af83, 0xc0818e10, 0xc0816c6c, 0xc0814a98, 0xc0812894,
        0xc0810660, 0xc080e3fd, 0xc080c16b, 0xc0809ea9, 0xc0807bb8, 0xc0805899, 0xc080354a, 0xc08011ce,
    };
    
    /*
     * Integer approximation of x * log2(x).
     *
     * x is converted to float and the float's bit pattern is treated as an
     * integer. Adding the _u[] entry selected by the top eight mantissa bits
     * turns that pattern into an approximation of log2(x) in units of 2^-23
     * (fast_entropy() rescales by 0.00000011920929, which is about 2^-23).
     * The result is that value multiplied by x, widened to long long.
     *
     * Assumes float is IEEE-754 single precision and int is 32 bits wide.
     * Intended for x > 0; for other inputs the table index is still in range
     * but the returned value has no logarithmic meaning.
     */
    static inline long long _lxlogx(int x)
    {
        /*
         * Read the float's bits through a union. The previous form,
         * *(int *)&f, accessed a float object through an int lvalue, which
         * violates the strict-aliasing rule and is undefined behaviour.
         */
        union {
            float f;
            int   i;
        } bits;
        int i;

        bits.f = (float)x;
        i = bits.i;
        i += _u[(i & 0x007F8000) >> 15];
        return (long long)i * x;
    }
    
    /*
     * Integer-arithmetic approximation of the entropy, in bits, of a
     * histogram.
     *
     *   counts  array of n occurrence counts
     *   n       number of entries in counts
     *   total   total number of observations (sum of the counts)
     *
     * Evaluates
     *     H = ( T*log2(T) - sum_i c_i*log2(c_i) ) / T
     * where every x*log2(x) term comes from _lxlogx(), so the running sum is
     * an integer in units of 2^-23 until the final conversion to double.
     *
     * Counts that are not positive are skipped, matching entropy().
     * Returns 0.0 when total is not positive: nothing was observed, and the
     * division by total below would otherwise be a division by zero.
     */
    double fast_entropy(int *counts, int n, int total)
    {
        long long s = 0;
        int i;

        if (total <= 0) {
            return 0.0;
        }

        for (i = 0; i < n; i++) {
            int c = counts[i];
            if (c > 0) {
                s -= _lxlogx(c);
            }
        }
        s += _lxlogx(total);
        s /= total;

        /* 0.00000011920929 is approximately 2^-23: undo the fixed-point scale. */
        return 0.00000011920929 * s;
    }
  • 相关阅读:
    2019.1.8兔子问题和汉诺塔问题的解决代码
    REST
    存储过程和函数练习
    十六、性能优化
    十五、MySQl日志
    Shell入门
    十四、数据备份
    十三、MySQL触发器
    十二、视图
    十一、MySQL锁
  • 原文地址:https://www.cnblogs.com/daishuo/p/3954711.html
Copyright © 2011-2022 走看看