【转】计算文档相似度（英文）

zoukankan html css js c++ java

【转】计算文档相似度（英文）
转自：http://blog.chinaunix.net/uid-26548237-id-3541783.html

1、向量空间模型
向量空间模型是一个把文本文件表示为标识符（比如索引词）向量的代数模型。它应用于信息过滤、信息检索、索引以及相关性排序。
文档和查询都用向量来表示。

每一维都相当于一个独立的词组。如果这个词组出现在文档中，那它在向量中的值就非零。已经有很多不同的方法来计算这些值，这些值叫做（词组）权重。其中一种广为人知的算法就是tf-idf权重。我们是根据应用来定义词组的。典型的词组就是一个单一词、关键词、或者较长的短语。如果单词被选为词组，那么向量的维数就是出现在词汇表中不同单词的个数。借助向量运算，就可以将各文档与查询进行比较。

根据文档相似度理论的假设，通过比较每个文档向量和原始查询向量（两个向量的类型是相同的）之间的夹角偏差，就可以计算出关键词搜索中各文档的相关性排序。实际上，计算向量之间夹角的余弦比直接计算夹角本身要简单：

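$$\cos\theta = \frac{\mathbf{d_2}\cdot\mathbf{q}}{\lVert\mathbf{d_2}\rVert\,\lVert\mathbf{q}\rVert}$$

其中d₂·q是文档向量（即下图中的d₂）和查询向量（即下图中的q）的点乘；分母为两个向量的模的乘积。向量的模通过下面的公式计算：

$$\lVert\mathbf{q}\rVert = \sqrt{\sum_{i=1}^{n} q_i^{2}}$$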

由于这个模型所考虑的所有向量的各个分量都是非负的，如果其余弦值为零，则表示查询向量和文档向量是正交的，即二者不匹配（换句话说，就是该检索词在文档中没有找到），此时相似度为0%。

下面是一个tf-idf权重的例子。


  优点：
相对于标准的布尔数学模型，向量空间模型具有如下优点：
1、基于线性代数的简单模型；
2、词组的权重不是二元的；
3、允许计算文档和索引之间的连续相似程度；
4、允许其根据可能的相关性来进行文件排序；
5、允许局部匹配；

局限：
1、不适用于较长的文件，因为它的相似度值不理想；
2、检索词组必须与文件中出现的词组精确匹配；
3、语义敏感度不佳，具有相同的语境但使用不同的词组的文件不能被关联起来；
4、词组在文档中出现的顺序在向量中间无法表示；
5、假定词组在统计上是独立的；
6、权重是直观上获得的而不够正式；

2、向量空间模型的使用
下面利用向量空间模型来计算文件的相似度。以上面讲述的余弦值Cosine为例，进行实现。
实现中的权重直接使用的是词出现的频率，另外，这里比较的是英文的相似度。
#include <sys/stat.h>

#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <string>

using namespace std;

// Status codes returned by the functions in this file.
// ERROR is parenthesised so the negative literal can never fuse with an
// operator at the point of expansion.
#define ERROR (-1)
#define OK 0

// When defined, main() prints every word/count pair it collected.
#define DEBUG

// Characters that separate tokens when the text is split with strtok():
// whitespace, punctuation and digits. '\r' is included so that files with
// CRLF line endings do not leave a carriage return glued to the last word
// of each line. The apostrophe is a separator too, so contractions such as
// "don't" are split into two tokens.
const char delim[] = " \t\r\n.,:;'`\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890";

/**
 * Convert a NUL-terminated string to lower case in place.
 *
 * @param word  Mutable C string to convert; may be NULL.
 * @return      The same pointer that was passed in (NULL if word is NULL).
 */
char *strtolower(char *word)
{
    if (word == NULL)
    {
        return NULL;
    }

    for (char *s = word; *s != '\0'; ++s)
    {
        // tolower() requires a value representable as unsigned char (or EOF);
        // passing a negative plain char (any byte >= 0x80 where char is
        // signed) is undefined behaviour, so convert first.
        *s = static_cast<char>(std::tolower(static_cast<unsigned char>(*s)));
    }

    return word;
}

int ReadFile(char *text_name, map<string, int> &word_count)

{

    char *str;

    char *word;

    char *file;

    struct stat sb;

    FILE *fp = fopen(text_name, "r");



    if(fp == NULL)

    {

        return ERROR;

    }



    if(stat(text_name, &sb))

    {

        return ERROR;

    }



    file = (char *)malloc(sb.st_size);

    if(file == NULL)

    {

        fclose(fp);

        return ERROR;

    }

    fread(file, sizeof(char), sb.st_size, fp);

    word = strtok(file, delim);



    while(word != NULL)

    {

        //delete the length of word <= 1

        if(strlen(word) <= 1)

        {

            word = strtok(NULL, delim);

            continue;

        }



        str = strtolower(strdup(word));

        string tmp = str;

        word_count[tmp]++;

        word = strtok(NULL, delim);

    }

}

int main(int argc, char **argv)

{

    char *text_name_one = "./big.txt";

    //char *text_name_one = "./1.txt";

    char *text_name_two = "./big.txt";

    //char *text_name_two = "./2.txt";



    map<string, int> word_count_one;

    map<string, int> word_count_two;



    double multi_one = 0.0;

    double multi_two = 0.0;

    double multi_third = 0.0;

    if(ReadFile(text_name_one, word_count_one) == ERROR)

    {

        cout << "ReadFile() error." << endl;

        return ERROR;

    }

#ifdef DEBUG

    map<string, int>::iterator map_first = word_count_one.begin();

    for( ; map_first != word_count_one.end(); map_first++)

    {

        cout << map_first->first << " " << map_first->second << endl;

    }

#endif

    if(ReadFile(text_name_two, word_count_two) == ERROR)

    {

        cout << "ReadFile() error." << endl;

        return ERROR;

    }

#ifdef DEBUG

    map<string, int>::iterator map_second = word_count_two.begin();

    for( ; map_second != word_count_two.end(); map_second++)

    {

        cout << map_second->first << " " << map_second->second << endl;

    }

#endif

    map<string, int>::iterator map_one = word_count_one.begin();

    map<string, int>::iterator map_tmp;

    for( ; map_one != word_count_one.end(); map_one++)

    {

        map_tmp = word_count_two.find(map_one->first);

        if(map_tmp == word_count_two.end())

        {

            multi_two += map_one->second * map_one->second;

            continue;

        }

        multi_one += map_one->second * map_tmp->second;

        multi_two += map_one->second * map_one->second;

        multi_third += map_tmp->second * map_tmp->second;

        word_count_two.erase(map_one->first);    //从2中删除1中具有的

    }

    //检查2中是否仍然有元素

    for(map_tmp = word_count_two.begin(); map_tmp != word_count_two.end(); map_tmp++)

    {

        multi_third += map_tmp->second * map_tmp->second;

    }

    multi_two = sqrt(multi_two);

    multi_third = sqrt(multi_third);

    double result = multi_one / ( multi_two * multi_third);

    cout << "相似度为: " << result * 100 << "%" << endl;

    return 0;

}
下面进行测试。
第一、进行检测两个相同的英文文本，文本链接为http://norvig.com/big.txt

给出了文本中词的部分统计，可以看到，两个相同文本的相似度为100%。

第二、文本1内容：......this is one! 文本2的内容：()()()......this is two

运行结果与实际手算的结果相同，两个文本的相似度为66.6667%。




以上只是简单地进行两个英文文本的相似度计算，只是在词条的层次上进行计算，并没有涉及到语义，所以，相对比较简单。
我对这方面非常感兴趣，还会继续学习其他相关的内容。


理论知识引自：http://zh.wikipedia.org/wiki/%E5%90%91%E9%87%8F%E7%A9%BA%E9%96%93%E6%A8%A1%E5%9E%8B
查看全文

相关阅读:
Ubuntu 16.04 安装 Apache, MySQL, PHP7
Ubuntu下apache2启动、停止、重启、配置
 织梦-数据库-表和字段说明手册
 DEDECMS去除后门隐患和漏洞以及冗余代码的方法
 Express使用html模板
 windows系统安装MongoDB
linux搭建node.js环境
 配置vuejs加载模拟数据
 安卓高级5 zXing
安卓高级5 传感器和震动模仿微信摇一摇Ui效果

原文地址：https://www.cnblogs.com/jackyzzy/p/3011769.html