Atitit.atiInputMethod v2词库清理策略工具 q229
1.1. Foreign code 外码清理
Only ASCII letters and digits are allowed
Before Be4
For 4
1.2. 垃圾词澄清
Too long
Not in the common wordlib (3000-5000)
作者:: 绰号:老哇的爪子 ( 全名::Attilax Akbar Al Rapanui 阿提拉克斯 阿克巴 阿尔 拉帕努伊 ) 汉字名:艾龙, EMAIL:1466519819@qq.com
转载请注明来源: http://blog.csdn.net/attilax
1.3. Code---
package com.attilax.inputmethod;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.lang3.StringUtils;
import com.attilax.io.FileLineTraver;
import com.attilax.io.filex;
import com.attilax.lang.text.strUtil;
import com.google.common.collect.Lists;
/**
 * Command-line tool that cleans an exported input-method word library.
 *
 * <p>Every input line is split on TAB into a word (field 0) and a foreign-code field
 * (field 1). The foreign-code field is split at its first {@code '#'} into a code part and a
 * suffix part. Lines whose trimmed code part is empty or fails
 * {@link StringUtils#isAlphanumeric(CharSequence)} are printed to standard output and are not
 * written to the output file; all other lines are written as
 * {@code word + TAB + code + suffix}.
 */
public class WordlibClrNTrim {

    /**
     * Reads the exported word library, filters it, and appends the surviving entries to a
     * newly named output file whose path is printed to standard output.
     *
     * @param args unused
     * @throws IOException declared for the underlying file helpers
     * @throws RuntimeException if the exported word-library file does not exist
     */
    public static void main(String[] args) throws IOException {
        // Machine-specific directory that holds the exported word library.
        String expWordlibDir = "C:\\Users\\Administrator\\Documents\\[ atian attilax ]\\";
        String strPath = expWordlibDir + "导出 - 全部词条.txt";
        if (!new File(strPath).exists()) {
            throw new RuntimeException("file not exist");
        }

        String fileName = "c:\\wordlib" + filex.getUUidName() + ".txt";
        System.out.println(fileName);

        // The same encoding name is passed to both the reader and the writer helper.
        String encode = "utf-8";
        filex fc = new filex(fileName, encode);
        try {
            FileLineTraver.trav(strPath, encode, (line) -> {
                String[] fields = line.toString().split("\t");
                if (fields.length < 2) {
                    // No foreign-code column: the line is skipped and nothing is written.
                    return line;
                }

                String[] codeParts = get_forighCode_arr(fields[1]);
                String code = codeParts[0].trim();
                // NOTE(review): isAlphanumeric accepts any Unicode letter or digit, not only
                // ASCII -- confirm whether non-ASCII codes should be rejected here as well.
                if (code.length() == 0 || !StringUtils.isAlphanumeric(code)) {
                    // Invalid foreign code: report the line and keep it out of the output.
                    System.out.println(line);
                    return "";
                }

                String word = fields[0];
                String newLine = word + "\t" + code + codeParts[1].trim();
                fc.append_HP_Safe(newLine + "\r\n");
                return line;
            });
        } finally {
            // Close the output even when traversal fails part-way through.
            fc.closeSF();
        }
        System.out.println("--f");
    }

    /**
     * Splits a foreign-code field at its first {@code '#'}.
     *
     * @param forighCodePix the raw foreign-code field (second TAB-separated column)
     * @return a two-element array: index 0 is the text before the first {@code '#'}, index 1
     *         is the remainder starting with that {@code '#'}; when the field contains no
     *         {@code '#'}, index 0 is the whole field and index 1 is the empty string
     */
    private static String[] get_forighCode_arr(String forighCodePix) {
        int sharpIndex = forighCodePix.indexOf('#');
        if (sharpIndex < 0) {
            // Without this guard substring(0, -1) would throw
            // StringIndexOutOfBoundsException and abort the whole run.
            return new String[] { forighCodePix, "" };
        }
        return new String[] {
            forighCodePix.substring(0, sharpIndex),
            forighCodePix.substring(sharpIndex)
        };
    }
}