zoukankan      html  css  js  c++  java
  • 基于词典的逆向最大匹配中文分词算法,更好实现中英文数字混合分词

    本文给出一个基于词典的逆向最大匹配中文分词算法,能够实现中英文、数字混合分词,比如能分出这样的词:bb霜、3室、乐phone、touch4、mp3、T恤。实际分词效果比正向最大匹配分词的效果好。

    publicclass RMM
    {
    privatestaticfinal Log log = LogFactory.getLog(RMM.class);

    privatestatic HashMap<String, Integer> dictionary =null;
    privatestaticfinalint WORD_MAX_LENGTH =9;

    static
    {
    loadDictionary();
    }


    //将句子切分出词,逆向最大匹配
    publicstatic ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    {
    Collections.reverse(list);
    ArrayList
    <Token> tokenlist=new ArrayList<Token>();
    for(Sentence sen:list)
    {
    StringBuffer word
    =new StringBuffer();
    int offset=sen.getStartOffset()+sen.getText().length;
    int bufferIndex = sen.getText().length-1;
    char c;
    boolean b=false;
    while(bufferIndex>-1)
    {
    offset
    --;
    c
    =sen.getText()[bufferIndex--];
    if(word.length()==0)
    word.append(c);
    else
    {
    String temp
    = (c+word.toString()).intern();
    if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
    word.insert(
    0, c);
    elseif(dictionary.containsKey(temp) && bufferIndex>-1)
    word.insert(
    0, c);
    else
    {
    bufferIndex
    ++;
    offset
    ++;
    while(word.length()>1&& dictionary.get(word.toString())!=null&& dictionary.get(word.toString())==2)
    {
    word.deleteCharAt(
    0);
    bufferIndex
    ++;
    offset
    ++;
    }

    b
    =true;
    }

    }

    if(b || bufferIndex==-1)
    {
    Token token
    =new Token(word.toString(),offset,offset+word.length(),"word");
    word.setLength(
    0);
    tokenlist.add(token);
    b
    =false;
    }

    }

    }

    Collections.reverse(tokenlist);
    return tokenlist;
    }


    //加载词典
    publicstaticvoid loadDictionary()
    {
    if (dictionary ==null)
    {
    dictionary
    =new HashMap<String, Integer>();
    InputStream is
    =null;
    BufferedReader br
    =null;
    try
    {
    is
    =new FileInputStream(new File(RMM.class.getClassLoader().getResource("dictionary.txt").toURI()));
    br
    =new BufferedReader(new InputStreamReader(is, "UTF-8"));
    String word
    =null;
    while ((word = br.readLine()) !=null)
    {
    word
    =word.toLowerCase();
    if ((word.indexOf("#") ==-1) && (word.length() <= WORD_MAX_LENGTH))
    {
    dictionary.put(word.intern(),
    1);
    int i =1;
    while(i < word.length()-1)
    {
    String temp
    = word.substring(i,word.length()).intern();
    if (!dictionary.containsKey(temp))
    dictionary.put(temp,
    2);
    i
    ++;
    }

    }

    }

    }

    catch (Exception e)
    {
    log.info(e);
    }

    finally
    {
    try
    {
    if(br!=null)
    br.close();
    if(is!=null)
    is.close();
    }

    catch (IOException e)
    {
    log.info(e);
    }

    }

    }

    }


    publicstatic String[] segWords(Reader reader)
    {
    ArrayList
    <String> list=new ArrayList<String>();
    try
    {
    ArrayList
    <Token> tlist= Util.getNewToken(getToken(Util.getSentence(reader)));
    for(Token t:tlist)
    {
    list.add(t.getWord());
    }

    }

    catch(IOException e)
    {
    log.info(e);
    }

    return (String[])list.toArray(new String[0]);
    }


    publicstaticvoid main(String[] args)
    {
    String[] cc
    =RMM.segWords(new StringReader("急、急、急、花里林居,二房二厅,业主诚心,出租".toLowerCase()));
    for(String c:cc)
    {
    System.out.println(c);
    }

    }

    }


    /**
     * Text helpers used around the segmenter: splitting raw text into sentences,
     * merging adjacent single-character tokens, and folding full-width
     * characters to ASCII.
     */
    public class Util
    {
        /**
         * Splits the text from {@code reader} into sentences made only of letters
         * and decimal digits.
         *
         * <p>A character is kept when its {@link Character#getType(int)} category
         * is lower-case letter, upper-case letter, decimal digit or "other letter"
         * (the category of CJK ideographs); every other character ends the current
         * sentence. Kept characters are folded with {@link #toAscii(int)} and
         * lower-cased, so upper-case input is retained in lower-case form rather
         * than being treated as a separator.
         *
         * <p>A sentence is emitted at every separator and at end of input after a
         * kept character, so consecutive separators produce empty sentences.
         *
         * @param reader source of the text; read until end of stream, not closed
         * @return the sentences in reading order; each start offset is the index,
         *         counted in characters read, of the sentence's first character
         * @throws IOException if reading from {@code reader} fails
         */
        public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
        {
            ArrayList<Sentence> list = new ArrayList<Sentence>();
            StringBuilder cb = new StringBuilder();
            int offset = 0; // index of the character currently held in d
            int d = reader.read();
            while (d > -1)
            {
                boolean separator = !isSentenceChar(d);
                if (!separator)
                {
                    cb.append((char) Character.toLowerCase(toAscii(d)));
                }
                d = reader.read();
                if (separator)
                {
                    // The sentence ends just before the separator at index offset.
                    flushSentence(cb, offset, list);
                }
                else if (d == -1)
                {
                    // Input ended on a kept character: the sentence ends after it.
                    flushSentence(cb, offset + 1, list);
                }
                offset++;
            }
            return list;
        }

        /** Tells whether the character belongs inside a sentence (letter or decimal digit). */
        private static boolean isSentenceChar(int ch)
        {
            int type = Character.getType(ch);
            return type == Character.LOWERCASE_LETTER
                || type == Character.UPPERCASE_LETTER
                || type == Character.DECIMAL_DIGIT_NUMBER
                || type == Character.OTHER_LETTER;
        }

        /** Emits the buffered characters as a sentence ending at index {@code end} and clears the buffer. */
        private static void flushSentence(StringBuilder cb, int end, ArrayList<Sentence> list)
        {
            char[] ioBuffer = new char[cb.length()];
            cb.getChars(0, cb.length(), ioBuffer, 0);
            list.add(new Sentence(ioBuffer, end - cb.length()));
            cb.setLength(0);
        }

        /**
         * Joins runs of directly adjacent one-character tokens whose character is
         * not in the "other letter" category (for example single Latin letters
         * and digits) into one token.
         *
         * <p>Two tokens are adjacent when the end offset of the first equals the
         * start offset of the second. The first token of a run is reused as the
         * merged token, so its word and end offset are modified in place.
         *
         * @param list tokens in reading order
         * @return a new list containing the merged and the untouched tokens in
         *         reading order
         * @throws IOException never thrown by this implementation; kept so that
         *         existing callers continue to compile
         */
        public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
        {
            ArrayList<Token> tokenlist = new ArrayList<Token>();
            Token pending = null; // run of single characters collected so far
            for (int i = 0; i < list.size(); i++)
            {
                Token t = list.get(i);
                boolean mergeable = t.getWord().length() == 1
                    && Character.getType((int) t.getWord().charAt(0)) != Character.OTHER_LETTER;
                if (!mergeable)
                {
                    if (pending != null)
                    {
                        tokenlist.add(pending);
                        pending = null;
                    }
                    tokenlist.add(t);
                }
                else if (pending == null)
                {
                    pending = t;
                }
                else if (pending.getEnd() == t.getStart())
                {
                    pending.setEnd(t.getEnd());
                    pending.setWord(pending.getWord() + t.getWord());
                }
                else
                {
                    // Not adjacent: close the current run and start a new one.
                    tokenlist.add(pending);
                    pending = t;
                }
            }
            if (pending != null)
            {
                tokenlist.add(pending);
            }
            return tokenlist;
        }

        /**
         * Converts a full-width digit or Latin letter to its ASCII counterpart.
         *
         * @param codePoint the character to convert
         * @return the ASCII character for U+FF10..U+FF19, U+FF21..U+FF3A and
         *         U+FF41..U+FF5A; otherwise {@code codePoint} unchanged
         */
        public static int toAscii(int codePoint)
        {
            boolean fullWidthDigit = codePoint >= 0xFF10 && codePoint <= 0xFF19; // 0-9
            boolean fullWidthUpper = codePoint >= 0xFF21 && codePoint <= 0xFF3A; // A-Z
            boolean fullWidthLower = codePoint >= 0xFF41 && codePoint <= 0xFF5A; // a-z
            if (fullWidthDigit || fullWidthUpper || fullWidthLower)
            {
                // The full-width forms sit at a fixed distance of 0xFEE0 above ASCII.
                return codePoint - 0xFEE0;
            }
            return codePoint;
        }
    }


  • 相关阅读:
    Warning: 执行完毕, 但带有警告 trigger trigger_EqPic_insert 已编译。
    c#生成cad缩略图或者图片
    ORACLE ROWNUM解析[转]
    集合已修改;可能无法执行枚举操作。
    JS 变量是否有值的判断
    简单方法解决bootstrap3 modal异步加载只一次的问题
    System.Data.DbType映射关系
    sql zhuan ORACLE
    Enterprise Library
    sql server转oracle需要注意的几点
  • 原文地址:https://www.cnblogs.com/ibook360/p/2245871.html
Copyright © 2011-2022 走看看