zoukankan      html  css  js  c++  java
  • 分词之最短编辑距离算法实现(包括中文)

    参考自:https://blog.csdn.net/ac540101928/article/details/52786435

    上面链接的方法详细讲解了最短编辑距离算法,但不能处理中文字符。

    Unicode 和 UTF-8 互转:https://www.cnblogs.com/cthon/p/9297232.html

    #include "EditDistance.h"
    #include <string>
    #include <vector>
    
    using std::cout;
    using std::endl;
    using std::string;
      
    /// Determine how many bytes a UTF-8 encoded character occupies, judging
    /// only by its first byte.
    ///
    /// The result is the number of consecutive 1 bits at the top of the byte
    /// (bits 7 down to 1 are inspected). A byte whose top bit is clear is a
    /// single-byte character and yields 1; a byte with only the top bit set
    /// also yields 1.
    ///
    /// @param ch  first byte of the character
    /// @return    byte length implied by ch (1..7)
    size_t nBytesCode(const char ch)
    {
    	const unsigned char lead = static_cast<unsigned char>(ch);

    	// Top bit clear: plain single-byte character.
    	if((lead & 0x80) == 0)
    		return 1;

    	// Top bit is set; walk a mask from bit 6 down to bit 1 and count
    	// every further set bit until the first zero bit is met.
    	size_t byteCount = 1;
    	for(unsigned char mask = 0x40; mask > 0x01 && (lead & mask) != 0; mask >>= 1)
    		++byteCount;

    	return byteCount;
    }  
      
      
    #if 0
    //该算法复杂了,不够简洁
    /// Alternative lead-byte classifier: returns the byte length of a UTF-8
    /// encoded character by matching its first byte against the lead-byte
    /// bit patterns.
    ///
    /// Any byte that is not a recognised lead byte (for example a
    /// continuation byte 10xxxxxx) is reported as length 1, so that a caller
    /// stepping through a string by the returned length always advances.
    ///
    /// @param ch  first byte of the character
    /// @return    byte length implied by ch (1..6), never 0
    size_t nBytesCode(const char ch)
    {
    	const unsigned char lead = static_cast<unsigned char>(ch);

    	if((lead & 0x80) == 0x00) return 1;	// 0xxxxxxx : single byte / ASCII
    	if((lead & 0xE0) == 0xC0) return 2;	// 110xxxxx
    	if((lead & 0xF0) == 0xE0) return 3;	// 1110xxxx
    	if((lead & 0xF8) == 0xF0) return 4;	// 11110xxx
    	if((lead & 0xFC) == 0xF8) return 5;	// 111110xx
    	if((lead & 0xFE) == 0xFC) return 6;	// 1111110x (covers 0xFC and 0xFD)

    	// Continuation byte or otherwise unrecognised lead byte: treat it as
    	// one unit instead of returning 0.
    	return 1;
    }
    #endif
    
    /// Count the characters (not bytes) in a multi-byte encoded string.
    ///
    /// Each character's byte length is taken from nBytesCode() applied to its
    /// first byte. If the last character is truncated (its lead byte
    /// announces more bytes than remain), it is still counted once and the
    /// scan stops at the end of the string instead of reading past it.
    ///
    /// @param str  string to measure
    /// @return     number of characters in str; 0 for an empty string
    std::size_t length(const std::string &str)
    {
    	std::size_t charCount = 0;
    	// Use '<' rather than '!=': the index advances by whole characters,
    	// so it may step over str.size() when the input ends mid-character.
    	for(std::size_t idx = 0; idx < str.size(); ++charCount)
    	{
    		const std::size_t nBytes = nBytesCode(str[idx]);
    		// Guard against a zero length so the scan always advances.
    		idx += (nBytes != 0 ? nBytes : 1);
    	}
    	return charCount;
    }
    
    /// Return the smallest of three integers.
    int triple_min(const int &a, const int &b, const int &c)
    {
    	int smallest = a;
    	if(b < smallest)
    	{
    		smallest = b;
    	}
    	if(c < smallest)
    	{
    		smallest = c;
    	}
    	return smallest;
    }
    
    /// Split a multi-byte encoded string into its characters, one std::string
    /// per character. The byte length of each character comes from
    /// nBytesCode() applied to its first byte.
    static std::vector<std::string> splitUtf8Chars(const std::string &str)
    {
    	std::vector<std::string> chars;
    	for(std::size_t pos = 0; pos < str.size(); )
    	{
    		std::size_t nBytes = nBytesCode(str[pos]);
    		if(nBytes == 0)
    		{
    			nBytes = 1;	// always advance, even on an unexpected zero length
    		}
    		// substr() clamps the count to the end of the string, so a
    		// truncated trailing character becomes a shorter final element.
    		chars.push_back(str.substr(pos, nBytes));
    		pos += nBytes;
    	}
    	return chars;
    }

    /// Compute the minimum edit distance between two strings, comparing whole
    /// characters rather than bytes so that multi-byte (e.g. Chinese) text is
    /// handled as well as ASCII.
    ///
    /// Insertion, deletion and substitution of one character each cost 1.
    ///
    /// @param lhs  first string
    /// @param rhs  second string
    /// @return     number of single-character edits turning lhs into rhs
    int editDistance(const std::string & lhs, const std::string &rhs)
    {
    	// Decode each string once up front instead of re-slicing rhs for
    	// every character of lhs.
    	const std::vector<std::string> lhsChars = splitUtf8Chars(lhs);
    	const std::vector<std::string> rhsChars = splitUtf8Chars(rhs);
    	const std::size_t rhsCount = rhsChars.size();

    	// Only the previous and the current row of the DP table are needed.
    	// Heap-backed vectors replace the non-standard variable-length array.
    	std::vector<int> prevRow(rhsCount + 1);
    	std::vector<int> currRow(rhsCount + 1);

    	// Row 0: turning an empty prefix of lhs into the first j characters
    	// of rhs takes j insertions.
    	for(std::size_t j = 0; j <= rhsCount; ++j)
    	{
    		prevRow[j] = static_cast<int>(j);
    	}

    	for(std::size_t i = 1; i <= lhsChars.size(); ++i)
    	{
    		// Column 0: turning the first i characters of lhs into an empty
    		// string takes i deletions.
    		currRow[0] = static_cast<int>(i);

    		for(std::size_t j = 1; j <= rhsCount; ++j)
    		{
    			if(lhsChars[i - 1] == rhsChars[j - 1])
    			{
    				// Same character: no additional edit.
    				currRow[j] = prevRow[j - 1];
    			}
    			else
    			{
    				currRow[j] = triple_min(
    					currRow[j - 1] + 1,		// insertion
    					prevRow[j] + 1,			// deletion
    					prevRow[j - 1] + 1);	// substitution
    			}
    		}
    		prevRow.swap(currRow);
    	}

    	// After the final swap prevRow holds the last computed row (or row 0
    	// when lhs is empty).
    	return prevRow[rhsCount];
    }
    

      

      

  • 相关阅读:
    【JQuery Easy UI】后台管理系统的简单布局分享
    Effective JavaScript Item 10 避免使用with
    娓娓道来c指针 (4)解析c的声明语句
    打造敏捷外包团队的高度自主与自我学习的生态系统
    LeetCode --- Count And Say
    RAD Studio XE8 技术研讨会讲义与范例程序下载
    SpringMVC工作原理
    SpringMVC 学习笔记(十一) SpirngMVC执行流程
    转 jeecg3.5中多数据源的配置
    浅谈JEECG多数据源的使用
  • 原文地址:https://www.cnblogs.com/cthon/p/9298751.html
Copyright © 2011-2022 走看看