zoukankan html css js c++ java

相似度分析，循环读入文件(加入了HanLP,算法第四版的库)

相似度分析，其中的分词采用HanLP即可：

http://www.open-open.com/lib/view/open1421978002609.htm

/***********************************************************   
* @Title      : SimilarityAnalyse.java 
* @Package    :  lsg.hawei.hanlp 
* @Description: Sentence-similarity analysis built on HanLP word segmentation
* @author     : liang shan guang 
* @date       :2016年11月8日 上午12:41:10 
* @version    : V1.0   
***********************************************************/
package lsg.hawei.hanlp;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import edu.princeton.cs.algs4.In;

/*********************************************************** 
* @ClassName   : SimilarityAnalyse 
* @Description : 用于相似度分析的库函数 
* @author      :liang shan guang
* @date        :2016年11月8日 上午12:41:10 
***********************************************************/
public class SimilarityAnalyse
{
    //阈值,用于决定语言分析和语序分析占相似度的百分比，此处0.2为语已占比
    public static double YUZHI = 0.2 ;
    public static Vector<String> participle( String str ) 
    {
        
        Vector<String> str1 = new Vector<String>() ;//对输入进行分词
        Segment segment=HanLP.newSegment().enableCustomDictionary(true);
        CustomDictionary.add("梁山广");//动态添加自定义词汇
        List<Term> termList=segment.seg(str);
        for(Term term:termList)
        {
//                System.out.println(term.toString());
            str1.add(term.toString());
        }        
        
        if( str1.size() == 0 ) 
        {
            return null ;
        }
        
        //分词后
        System.out.println( "str分词后：" + str1 );
        return str1;
    }
    
    public static double getSimilarity(Vector<String> T1, Vector<String> T2) throws Exception 
    {
        int size = 0 , size2 = 0 ;
        if ( T1 != null && ( size = T1.size() ) > 0 && T2 != null && ( size2 = T2.size() ) > 0 ) {
            
            Map<String, double[]> T = new HashMap<String, double[]>();
            
            //T1和T2的并集T
            String index = null ;
            for ( int i = 0 ; i < size ; i++ ) {
                index = T1.get(i) ;
                if( index != null){
                    double[] c = T.get(index);
                    c = new double[2];
                    c[0] = 1;    //T1的语义分数Ci
                    c[1] = YUZHI;//T2的语义分数Ci
                    T.put( index, c );
                }
            }
     
            for ( int i = 0; i < size2 ; i++ ) {
                index = T2.get(i) ;
                if( index != null ){
                    double[] c = T.get( index );
                    if( c != null && c.length == 2 ){
                        c[1] = 1; //T2中也存在，T2的语义分数=1
                    }else {
                        c = new double[2];
                        c[0] = YUZHI; //T1的语义分数Ci
                        c[1] = 1; //T2的语义分数Ci
                        T.put( index , c );
                    }
                }
            }
                
            //开始计算，百分比
            Iterator<String> it = T.keySet().iterator();
            double s1 = 0 , s2 = 0, Ssum = 0;  //S1、S2
            while( it.hasNext() ){
                double[] c = T.get( it.next() );
                Ssum += c[0]*c[1];
                s1 += c[0]*c[0];
                s2 += c[1]*c[1];
            }
            //百分比
            return Ssum / Math.sqrt( s1*s2 );
        } else {
            throw new Exception("传入参数有问题！");
        }
    }

    /************************************************************* 
    * @Title      : main 
    * @Description: TODO(这里用一句话描述这个方法的作用) 
    * @param      ：   @param args    设定文件 
    * @return     ：void    返回类型 
    * @throws 
    *************************************************************/
    public static void main(String[] args)
    {
        String currentFolder = System.getProperty("user.dir");
        String fileFolder    = currentFolder+"\file\";
        String fileName1     = fileFolder+"wait2Compare.txt";//读入待分析的数据
        String fileName2     = fileFolder+"standardStrs.txt";//读入标准的数据
        String[] wait2Compare=In.readStrings(fileName1);
        String[] standardStrs=In.readStrings(fileName2);
        for(String str1:wait2Compare)
        {
            for(String str2:standardStrs)
            {
                Vector<String> testLine1=participle(str1);
                Vector<String> testLine2=participle(str2);
                try
                {
                    double similarity=getSimilarity(testLine1,testLine2);
                    System.out.println("两个句子的相似度为:"+similarity);
                } catch (Exception e)
                {
                    // TODO Auto-generated catch block
                    System.out.println("相似度 计算失败，失败原因如下：");
                    e.printStackTrace();
                }
            }
        }
        
    }

}

查看全文

相关阅读:
真香警告！多线程分类表情包爬取，一起斗图叭(*^▽^*)~~~
小白入门爬虫快速上手（详细步骤）
利用selenium尝试爬取豆瓣图书
 OpenCV图像人脸检测及视频中的人脸检测（附源码）
Python爬取爱豆（李易峰）微博评论（附源码）
Python爬取最爱的电影并下载到本地（附源码）
[转载]关于RNA的种类和组成
 2020年中国基因测序产业竞争格局全局观
 Improving and correcting the contiguity of long-read genome assemblies of three plant species using optical mapping and chromosome conformation capture data
三代测序

原文地址：https://www.cnblogs.com/lsgwr/p/6040994.html