zoukankan      html  css  js  c++  java
  • 对比两个字符串的相似度

    package com.opslab.util.algorithmImpl;

    import com.opslab.util.CharsetUtil;
    import com.opslab.util.SysUtil;

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;
    import java.io.UnsupportedEncodingException;

    /**
     * String similarity helpers.
     *
     * <p>Two independent measures are provided: {@link #SimilarDegree} (longest common
     * subsequence over letters, digits and CJK ideographs only) and
     * {@link #SimilarityRatio} (edit distance over the raw strings).
     * {@link #simpleEncoding} is an unrelated best-effort charset guesser.
     */
    public class StringImpl {

        /** Candidate charset names tried, in this order, by the fallback stage of simpleEncoding. */
        private static final String[] FALLBACK_ENCODINGS = {"GB2312", "ISO-8859-1", "UTF-8", "GBK"};

        // ---- First approach: longest common subsequence ----

        /**
         * Returns one longest common subsequence of the two strings.
         *
         * <p>Despite the historical name, the matched characters do not have to be
         * contiguous: this is the classic LCS dynamic program, not "longest common
         * substring".
         *
         * @param strA first string
         * @param strB second string
         * @return a longest common subsequence (empty when nothing is shared)
         */
        private static String longestCommonSubstring(String strA, String strB) {
            char[] charsA = strA.toCharArray();
            char[] charsB = strB.toCharArray();
            int m = charsA.length;
            int n = charsB.length;

            // matrix[i][j] = LCS length of the first i chars of strA and the first j chars of strB.
            int[][] matrix = new int[m + 1][n + 1];
            for (int i = 1; i <= m; i++) {
                for (int j = 1; j <= n; j++) {
                    if (charsA[i - 1] == charsB[j - 1]) {
                        matrix[i][j] = matrix[i - 1][j - 1] + 1;
                    } else {
                        matrix[i][j] = Math.max(matrix[i][j - 1], matrix[i - 1][j]);
                    }
                }
            }

            // Walk back from the bottom-right corner, filling the result from its end.
            char[] result = new char[matrix[m][n]];
            int currentIndex = result.length - 1;
            while (matrix[m][n] != 0) {
                // A non-zero cell implies m >= 1 and n >= 1, so the neighbours below exist.
                if (matrix[m][n] == matrix[m][n - 1]) {
                    // Same length without strB's last char: step left.
                    // (Previously compared the row arrays matrix[n] and matrix[n - 1] by
                    // reference, which was never true and could index past the last row.)
                    n--;
                } else if (matrix[m][n] == matrix[m - 1][n]) {
                    // Same length without strA's last char: step up.
                    m--;
                } else {
                    // Neither neighbour matches, so this cell was produced by a character match.
                    result[currentIndex] = charsA[m - 1];
                    currentIndex--;
                    n--;
                    m--;
                }
            }
            return new String(result);
        }

        /**
         * Tells whether a character takes part in the comparison.
         *
         * @param charValue character to classify
         * @return {@code true} for U+4E00..U+9FA5 (CJK ideographs), ASCII letters and ASCII digits
         */
        private static boolean charReg(char charValue) {
            return (charValue >= 0x4E00 && charValue <= 0X9FA5)
                    || (charValue >= 'a' && charValue <= 'z')
                    || (charValue >= 'A' && charValue <= 'Z')
                    || (charValue >= '0' && charValue <= '9');
        }

        /**
         * Drops every character rejected by {@link #charReg(char)} (punctuation, whitespace, symbols).
         *
         * @param str input string
         * @return {@code str} reduced to its accepted characters, in the original order
         */
        private static String removeSign(String str) {
            StringBuilder sb = new StringBuilder(str.length());
            for (char item : str.toCharArray()) {
                if (charReg(item)) {
                    sb.append(item);
                }
            }
            return sb.toString();
        }

        /**
         * Quickly estimates how similar two strings are.
         *
         * <p>Both strings are first stripped of everything except ASCII letters, ASCII
         * digits and CJK ideographs; the score is then the length of their longest common
         * subsequence divided by the length of the longer stripped string.
         *
         * <p>The original note recommended passing the longer string first for speed; the
         * returned value is the same in either order.
         *
         * @param strA the first (by convention longer) string
         * @param strB the second (by convention shorter) string
         * @return a score in {@code [0, 1]}; {@code 1.0} when nothing comparable remains in
         *         either string, since the stripped strings are then identical
         * @throws NullPointerException if either argument is {@code null}
         */
        public static double SimilarDegree(String strA, String strB) {
            String newStrA = removeSign(strA);
            String newStrB = removeSign(strB);
            int longest = Math.max(newStrA.length(), newStrB.length());
            if (longest == 0) {
                // Avoid 0 / 0 (NaN): two empty stripped strings are equal.
                return 1.0;
            }
            int common = longestCommonSubstring(newStrA, newStrB).length();
            return common * 1.0 / longest;
        }

        // ---- Second approach: edit distance ----

        /**
         * Computes the edit distance between two strings, counting single-character
         * insertions, deletions and substitutions.
         *
         * @param str    source string
         * @param target string to compare against
         * @return minimum number of single-character edits turning {@code str} into {@code target}
         */
        private static int compare(String str, String target) {
            int n = str.length();
            int m = target.length();
            if (n == 0) {
                return m;
            }
            if (m == 0) {
                return n;
            }

            // d[i][j] = distance between the first i chars of str and the first j chars of target.
            int[][] d = new int[n + 1][m + 1];
            for (int i = 0; i <= n; i++) { // first column
                d[i][0] = i;
            }
            for (int j = 0; j <= m; j++) { // first row
                d[0][j] = j;
            }

            for (int i = 1; i <= n; i++) {
                char ch1 = str.charAt(i - 1);
                for (int j = 1; j <= m; j++) {
                    char ch2 = target.charAt(j - 1);
                    // Substitution cost: 0 when the characters match, otherwise 1.
                    int temp = (ch1 == ch2) ? 0 : 1;
                    // Cheapest of: cell above + 1, cell to the left + 1, diagonal + substitution cost.
                    d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + temp);
                }
            }
            return d[n][m];
        }

        /**
         * Returns the smallest of three ints.
         */
        private static int min(int one, int two, int three) {
            return Math.min(Math.min(one, two), three);
        }

        /**
         * Computes a similarity ratio from the edit distance of the raw strings.
         *
         * @param str    first string
         * @param target second string
         * @return {@code 1 - distance / max(length)}, in {@code [0, 1]}; {@code 1.0} when both
         *         strings are empty
         * @throws NullPointerException if either argument is {@code null}
         */
        public static double SimilarityRatio(String str, String target) {
            int longest = Math.max(str.length(), target.length());
            if (longest == 0) {
                // Avoid 0 / 0 (NaN): two empty strings are identical.
                return 1.0;
            }
            return 1 - (double) compare(str, target) / longest;
        }

        /**
         * Guesses a charset name for a string by round-trip checks.
         *
         * <p>Stage one encodes {@code str} with {@code SysUtil.JVM_ENCODING} and returns the
         * first of UTF-8, GBK, ISO-8859-1 whose decoding of those bytes reproduces
         * {@code str}. Stage two encodes and decodes {@code str} with each of GB2312,
         * ISO-8859-1, UTF-8, GBK in turn and returns the first that round-trips unchanged.
         *
         * <p>NOTE(review): a Java {@code String} carries no encoding of its own, so the result
         * only says which charset can represent the text; it is a heuristic.
         * {@code SysUtil.JVM_ENCODING} is presumably the JVM default charset name; confirm.
         *
         * @param str the string to inspect
         * @return the guessed charset name, or an empty string when no candidate matches
         */
        public static String simpleEncoding(String str) {
            try {
                byte[] bs = str.getBytes(SysUtil.JVM_ENCODING);
                if (str.equals(new String(bs, CharsetUtil.UTF_8))) {
                    return CharsetUtil.UTF_8;
                }
                if (str.equals(new String(bs, CharsetUtil.GBK))) {
                    return CharsetUtil.GBK;
                }
                if (str.equals(new String(bs, "ISO-8859-1"))) {
                    return "ISO-8859-1";
                }
            } catch (UnsupportedEncodingException e) {
                // Best effort: report and fall through to the round-trip candidates.
                e.printStackTrace();
            }

            for (String encode : FALLBACK_ENCODINGS) {
                try {
                    if (str.equals(new String(str.getBytes(encode), encode))) {
                        return encode;
                    }
                } catch (UnsupportedEncodingException e) {
                    // This charset is unavailable on the running JVM; try the next candidate.
                    e.printStackTrace();
                }
            }
            return "";
        }

    }

  • 相关阅读:
    删除Openstack所有组件
    redis + twemproxy 分布式集群与部署
    Nginx 负载均衡动静分离配置
    【读书笔记】sklearn翻译
    【机器学习算法应用和学习_2_理论篇】2.2 聚类_kmeans
    【机器学习算法应用和学习_1_基础篇】1.2 pandas
    【python脚本】提供图片url,批量下载命名图片
    【机器学习算法应用和学习_3_代码API篇】3.2 M_分类_逻辑回归
    【机器学习算法应用和学习_2_理论篇】2.2 M_分类_逻辑回归
    【机器学习算法应用与学习_3_代码API篇】3.8分类模型封装
  • 原文地址:https://www.cnblogs.com/chinaifae/p/10254654.html
Copyright © 2011-2022 走看看