zoukankan      html  css  js  c++  java
  • 利用Levenshtein Distance (编辑距离)实现文档相似度计算

    1.首先将word文档(.docx)的后缀名改为.zip,然后解压缩,得到其中的word/document.xml

        /**
         * Renames the file at {@code path} so that its extension becomes {@code .zip}.
         *
         * <p>Only a dot inside the final path component counts as the start of an
         * extension; a dot in a parent directory name is left alone. When the file has
         * no extension, {@code .zip} is simply appended.
         *
         * <p>The rename is best-effort: nothing is attempted when the source file does
         * not exist (for example because an earlier call already renamed it) or when it
         * already carries the target name, and a failed rename is reported on
         * {@code System.err} instead of being thrown.
         *
         * @param path path of the file to rename
         * @return the absolute path of the file with its extension removed
         *         (without the {@code .zip} suffix)
         */
        public static String reName(String path){
            File source=new File(path).getAbsoluteFile();
            String absolute=source.getPath();
            int lastSeparator=absolute.lastIndexOf(File.separatorChar);
            int lastDot=absolute.lastIndexOf('.');
            // A dot that precedes the last separator belongs to a directory name, not to the file.
            String base=lastDot>lastSeparator?absolute.substring(0,lastDot):absolute;
            File target=new File(base+".zip");
            if(source.exists()&&!source.equals(target)&&!source.renameTo(target)){
                System.err.println("Could not rename "+source+" to "+target);
            }
            return base;
        }
        
        /**
         * Renames the file at {@code path} to a {@code .zip} archive via
         * {@link #reName(String)} and extracts it into a directory named after the
         * file without its extension.
         *
         * <p>Directory entries are created and then skipped, so they no longer end the
         * extraction. Entries whose resolved location would fall outside the target
         * directory are skipped with a message on {@code System.err}. Any failure while
         * reading or writing is printed and extraction stops; as before, the method
         * does not throw and still returns the expected result path.
         *
         * @param path path of the Word document to extract
         * @return a {@link File} pointing at {@code word/document.xml} inside the
         *         extraction directory; it is not checked for existence
         */
        public static File zipDeCompressing(String path){
            long startTime=System.currentTimeMillis();
            // reName renames the file on disk, so call it exactly once and reuse the result.
            String base=reName(path);
            File destDir=new File(base);
            try(ZipInputStream zin=new ZipInputStream(new FileInputStream(base+".zip"))){
                String destRoot=destDir.getCanonicalPath()+File.separator;
                byte[] buffer=new byte[8192];
                ZipEntry entry;
                while((entry=zin.getNextEntry())!=null){
                    File target=new File(destDir,entry.getName());
                    // Guard against entry names such as "../../x" escaping the target directory.
                    if(!target.getCanonicalPath().startsWith(destRoot)){
                        System.err.println("Skipping zip entry outside target directory: "+entry.getName());
                        continue;
                    }
                    if(entry.isDirectory()){
                        target.mkdirs();
                        continue;
                    }
                    File parent=target.getParentFile();
                    if(parent!=null){
                        parent.mkdirs();
                    }
                    try(FileOutputStream out=new FileOutputStream(target)){
                        int read;
                        // ZipInputStream.read returns -1 at the end of the current entry.
                        while((read=zin.read(buffer))!=-1){
                            out.write(buffer,0,read);
                        }
                    }
                    System.out.println(target+"解压成功");
                }
            }catch(Exception e){
                // Best-effort by design: report the problem and fall through to the return below.
                e.printStackTrace();
            }
            long endTime=System.currentTimeMillis();
            System.out.println("耗费时间:"+(endTime-startTime)+"ms");
            return new File(base+"/word/document.xml");
        }

    2.利用Levenshtein Distance (编辑距离)计算文本相似度:相似度 = 1 - 编辑距离 / max(两个字符串的长度)

    /**
     * Returns the smallest of three integers.
     *
     * @param one   first candidate
     * @param two   second candidate
     * @param three third candidate
     * @return the minimum of the three arguments
     */
    private static int min(int one, int two, int three) {
        return Math.min(one, Math.min(two, three));
    }
     
        /**
         * Computes the Levenshtein (edit) distance between two strings: the minimum
         * number of single-character insertions, deletions and substitutions needed
         * to turn one into the other. Characters are compared as {@code char} values.
         *
         * <p>Only two rows of the dynamic-programming table are kept, sized to the
         * shorter input, so memory use grows with the shorter string's length rather
         * than with the product of both lengths.
         *
         * @param str1 first string, must not be {@code null}
         * @param str2 second string, must not be {@code null}
         * @return the edit distance; the other string's length when one is empty
         */
        public static int ld(String str1, String str2) {
            int n = str1.length();
            int m = str2.length();
            if (n == 0) {
                return m;
            }
            if (m == 0) {
                return n;
            }
            // The distance is symmetric, so lay the shorter string along the row.
            String rowText = str2;
            String colText = str1;
            if (m > n) {
                rowText = str1;
                colText = str2;
            }
            int rowLen = rowText.length();
            int colLen = colText.length();

            int[] previous = new int[rowLen + 1];
            int[] current = new int[rowLen + 1];
            for (int j = 0; j <= rowLen; j++) { // distance from the empty prefix
                previous[j] = j;
            }
            for (int i = 1; i <= colLen; i++) {
                char ch1 = colText.charAt(i - 1);
                current[0] = i;
                for (int j = 1; j <= rowLen; j++) {
                    int cost = (ch1 == rowText.charAt(j - 1)) ? 0 : 1;
                    // deletion, insertion, substitution (or match when cost is 0)
                    int best = Math.min(previous[j] + 1, current[j - 1] + 1);
                    current[j] = Math.min(best, previous[j - 1] + cost);
                }
                int[] swap = previous;
                previous = current;
                current = swap;
            }
            return previous[rowLen];
        }
        /**
         * Computes a similarity score from the edit distance:
         * {@code 1 - ld(str1, str2) / max(len1, len2)}.
         *
         * <p>Two empty strings are identical and score {@code 1.0}; without this
         * special case the formula would divide zero by zero and yield {@code NaN}.
         *
         * @param str1 first string
         * @param str2 second string
         * @return a value between 0 and 1, where 1 means the strings are equal;
         *         {@code 0.1} when either argument is {@code null} (the fallback
         *         value this method has always returned for that case)
         */
        public static double sim(String str1, String str2) {
            if (str1 == null || str2 == null) {
                // Legacy fallback: this used to come from a catch-all around the NPE.
                return 0.1;
            }
            int longest = Math.max(str1.length(), str2.length());
            if (longest == 0) {
                return 1.0;
            }
            return 1 - (double) ld(str1, str2) / longest;
        }

     源码下载地址:http://download.csdn.net/detail/xiangrikuigt/9696149

  • 相关阅读:
    JSTL 标签库
    C++(一)— stringstream的用法
    深度学习—反向传播的理解
    深度学习—线性分类器理解
    Python—numpy.bincount()
    Python—numpy.argsort()
    Python—numpy.flatnonzero()
    C++(零)— 提高程序运行效率
    机器学习(八)—GBDT 与 XGBOOST
    机器学习(七)—Adaboost 和 梯度提升树GBDT
  • 原文地址:https://www.cnblogs.com/gting/p/6108348.html
Copyright © 2011-2022 走看看