zoukankan      html  css  js  c++  java
  • java实现字符串匹配问题之求两个字符串的最大公共子串

    转载请注明出处:http://blog.csdn.net/xiaojimanman/article/details/38924981

    近期在项目工作中有一个关于文本对照的需求,经过这段时间的学习,总结了这篇博客内容:求两个字符串的最大公共子串。

    算法思想:基于图计算两字符串的公共子串。详细算法思想參照下图:




    输入字符串S1:achmacmh    输入字符串S2:macham

    1)第a步,是将字符串s1,s2分别按字节拆分,构成一个二维数组;

    2)二维数组中的值如b所看到的,比方第一行第一列的值表示字符串s2和s1的第一个字节是否相等,若相等就是1,否则就是0,终于产生b所看到的的二维数组;

    3)分别求二维数组中斜线上的公共因子(斜线为元素a右下角值,即a[i][j]的下一个元素是a[i+1][j+1];公共因子为1所在的位置构成的字符串);

    4)对全部公共因子排序,返回最大的公共因子的值。


    详细的实现代码例如以下所看到的:

    package cn.lulei.compare;
    
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.List;
    
    public class StringCompare {
    	private int a;
    	private int b;
    	
    	public String getMaxLengthCommonString(String s1, String s2) {
    		if (s1 == null || s2 == null)  {
    			return null;
    		}
    		a = s1.length();//s1长度做行
    		b = s2.length();//s2长度做列
    		if(a== 0 || b == 0) {
    			return "";
    		}
    		//设置匹配矩阵
    		boolean [][] array = new boolean[a][b];
    		for (int i = 0; i  < a; i++) {
    			char c1 = s1.charAt(i);
    			for (int j = 0; j < b; j++) {
    				char c2 = s2.charAt(j);
    				if (c1 == c2) {
    					array[i][j] = true;
    				}  else {
    					array[i][j] = false;
    				}
    			}
    		}
    		//求全部公因子字符串,保存信息为相对第二个字符串的起始位置和长度
    		List<ChildString> childStrings = new ArrayList<ChildString>();
    		for (int i = 0; i < a; i++) {
    			getMaxSort(i, 0, array, childStrings);
    		}
    		for (int i = 1; i < b; i++) {
    			getMaxSort(0, i, array, childStrings);
    		}
    		//排序
    		sort(childStrings);
    		if (childStrings.size() < 1) {
    			return "";
    		}
    		//返回最大公因子字符串
    		int max = childStrings.get(0).maxLength;
    		StringBuffer sb = new StringBuffer();
    		for (ChildString s: childStrings) {
    			if (max != s.maxLength) {
    				break;
    			}
    			sb.append(s2.substring(s.maxStart, s.maxStart + s.maxLength));
    			sb.append("
    ");
    		}
    		return sb.toString();
    	}
    	
    	//排序,倒叙
    	private void sort(List<ChildString> list) {
    		Collections.sort(list, new Comparator<ChildString>(){
    			public int compare(ChildString o1, ChildString o2) {
    				return o2.maxLength - o1.maxLength;
    			}
    		});
    	}
    	
    	//求一条斜线上的公因子字符串
    	private void getMaxSort(int i, int j, boolean [][] array, List<ChildString> sortBean) {
    		int length = 0;
    		int start = j;
    		for (; i < a && j < b; i++,j++) {
    			if (array[i][j]) {
    				length++;
    			} else {
    				sortBean.add(new ChildString(length, start));
    				length = 0;
    				start = j + 1;
    			}
    			if (i == a-1 || j == b-1) {
    				sortBean.add(new ChildString(length, start));
    			}
    		}
    	}
    	
    	//公因子类
    	class ChildString {
    		int maxLength;
    		int maxStart;
    		
    		ChildString(int maxLength, int maxStart){
    			this.maxLength = maxLength;
    			this.maxStart = maxStart;
    		}
    	}
    
    	/**
    	 * @param args
    	 */
    	public static void main(String[] args) {
    		// TODO Auto-generated method stub
    		System.out.println(new StringCompare().getMaxLengthCommonString("achmacmh", "macham"));
    	}
    }
    

    程序终于运行结果是:


    对于两个文件的比对个人觉得能够參照这样的算法思想(自己如今并为实现),在日后的博客中将会写到。


    上述实现过程中,用数组保存了全部的公共子串信息,然后排序取最大的子串,这样的做法假设仅仅是求最大子串的话,算法就不是非常合理,因此做了例如以下改动,List仅仅保存当前计算中最大的子串,详细实现例如以下:

     /**  
     *@Description: 字符串比較   
     */ 
    package com.lulei.test;
    
    import java.util.ArrayList;
    import java.util.List;
    
    public class StringCompare {
    	private int a;
    	private int b;
    	private int maxLength = -1;
    	
    	public String getMaxLengthCommonString(String s1, String s2) {
    		if (s1 == null || s2 == null)  {
    			return null;
    		}
    		a = s1.length();//s1长度做行
    		b = s2.length();//s2长度做列
    		if(a== 0 || b == 0) {
    			return "";
    		}
    		//设置匹配矩阵
    		boolean [][] array = new boolean[a][b];
    		for (int i = 0; i  < a; i++) {
    			char c1 = s1.charAt(i);
    			for (int j = 0; j < b; j++) {
    				char c2 = s2.charAt(j);
    				if (c1 == c2) {
    					array[i][j] = true;
    				}  else {
    					array[i][j] = false;
    				}
    			}
    		}
    		//求全部公因子字符串,保存信息为相对第二个字符串的起始位置和长度
    		List<ChildString> childStrings = new ArrayList<ChildString>();
    		for (int i = 0; i < a; i++) {
    			getMaxSort(i, 0, array, childStrings);
    		}
    		for (int i = 1; i < b; i++) {
    			getMaxSort(0, i, array, childStrings);
    		}
    		StringBuffer sb = new StringBuffer();
    		for (ChildString s: childStrings) {
    			sb.append(s2.substring(s.maxStart, s.maxStart + s.maxLength));
    			sb.append("
    ");
    		}
    		return sb.toString();
    	}
    	
    	//求一条斜线上的公因子字符串
    	private void getMaxSort(int i, int j, boolean [][] array, List<ChildString> sortBean) {
    		int length = 0;
    		int start = j;
    		for (; i < a && j < b; i++,j++) {
    			if (array[i][j]) {
    				length++;
    			} else {
    				//直接add,保存全部子串,以下的推断,仅仅保存当前最大的子串
    				//sortBean.add(new ChildString(length, start));
    				if (length == maxLength) {
    					sortBean.add(new ChildString(length, start));
    				} else if (length > maxLength) {
    					sortBean.clear();
    					maxLength = length;
    					sortBean.add(new ChildString(length, start));
    				}
    				length = 0;
    				start = j + 1;
    			}
    			if (i == a-1 || j == b-1) {
    				//直接add,保存全部子串,以下的推断,仅仅保存当前最大的子串
    				//sortBean.add(new ChildString(length, start));
    				if (length == maxLength) {
    					sortBean.add(new ChildString(length, start));
    				} else if (length > maxLength) {
    					sortBean.clear();
    					maxLength = length;
    					sortBean.add(new ChildString(length, start));
    				}
    			}
    		}
    	}
    	
    	//公因子类
    	class ChildString {
    		int maxLength;
    		int maxStart;
    		
    		ChildString(int maxLength, int maxStart){
    			this.maxLength = maxLength;
    			this.maxStart = maxStart;
    		}
    	}
    
    	/**
    	 * @param args
    	 */
    	public static void main(String[] args) {
    		// TODO Auto-generated method stub
    		System.out.println(new StringCompare().getMaxLengthCommonString("abcdef", "defabc"));
    	}
    }
    


  • 相关阅读:
    PHP实现栈(Stack)数据结构
    为什么推荐std::string而不是char*
    PHP实现插入排序算法
    OpenCms Application dev-ref
    OpenCMS integration with Spring MVC--reference
    安装opencms时遇到问题及解决方法
    Java ZIP File Example---refernce
    Top 10 Algorithms for Coding Interview--reference
    JVMInternals--reference
    java code to byte code--partone--reference
  • 原文地址:https://www.cnblogs.com/bhlsheji/p/4230274.html
Copyright © 2011-2022 走看看