zoukankan      html  css  js  c++  java
  • 使用Java将搜狗词库文件(文件后缀为.scel)转为.txt文件

    需要做一个根据词库筛选主要词汇的功能。去搜狗下载专业词汇词库时,发现文件是 .scel 格式,而通过在线转换工具(http://tools.bugscaner.com/sceltotxt/)转换为 txt 时报错,因此只能通过 Java 程序来转换。

    核心代码如下,涉及到四个类:FileProcessing、SougouScelFileProcessing、SougouScelModel、TxtFileProcessing

    文件 FileProcessing.java

    package cn.ucmed.impl;
    
    import java.io.File;
    import java.io.IOException;
    
    /**
     * Base class for converters that read dictionary files and write plain-text
     * ({@code .txt}) word lists.
     *
     * <p>Subclasses implement the three abstract {@code parse*} methods; this class only
     * supplies the optional output directory and a helper that prepares the directory of
     * a target file.
     */
    public abstract class FileProcessing {
        /** Directory that receives generated files in batch mode; {@code null} if unset. */
        protected String targetDir;

        /**
         * Parses a single file.
         *
         * @param filePath       path of the source file to parse
         * @param targetFilePath path of the file to write
         * @param isAppend       {@code true} to append to the target, {@code false} to overwrite it
         */
        public abstract void parseFile(String filePath, String targetFilePath, boolean isAppend);

        /**
         * Parses every file of a directory and merges the result into one target file.
         *
         * @param fileDirPath    path of the source directory
         * @param targetFilePath path of the file to write
         * @param isAppend       {@code true} to append to the target, {@code false} to overwrite it
         * @throws IOException if reading or writing fails
         */
        public abstract void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException;

        /**
         * Parses one file, or every file of a directory, producing one txt file per source.
         * Output goes to the directory set with {@link #setTargetDir(String)}; when that is
         * not set, the txt files are created next to their sources.
         *
         * @param filePath path of the source file or directory
         * @param isAppend {@code false} to overwrite, {@code true} to append
         */
        public abstract void parseFile(String filePath, boolean isAppend);

        /**
         * Makes sure the directory that will contain {@code targetFilePath} exists.
         *
         * <p>The parent is resolved with {@link File#getParentFile()} instead of searching
         * for a literal {@code "/"}, so paths written with the platform separator work too.
         * Directory creation is best effort: if it fails, opening the target file
         * afterwards reports the problem.
         *
         * @param targetFilePath path of the target file; must end with {@code .txt}
         * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt}
         */
        protected void createParentDir(String targetFilePath) {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            // Resolve against the working directory first so a bare file name still has a parent.
            File parent = new File(targetFilePath).getAbsoluteFile().getParentFile();
            if (parent != null && !parent.exists()) {
                parent.mkdirs();
            }
        }

        /**
         * Parses a single file (or directory), overwriting any existing output.
         *
         * @param filePath path of the source file or directory
         */
        public void parseFile(String filePath) {
            parseFile(filePath, false);
        }

        /**
         * Returns the directory that receives generated txt files.
         *
         * @return the configured directory, or {@code null} if none was set
         */
        public String getTargetDir() {
            return targetDir;
        }

        /**
         * Sets the directory that receives generated txt files.
         *
         * @param targetDir directory path
         */
        public void setTargetDir(String targetDir) {
            this.targetDir = targetDir;
        }
    }
    SougouScelFileProcessing.java
    package cn.ucmed.impl;
    
    import lombok.extern.slf4j.Slf4j;
    
    import java.io.*;
    import java.util.*;
    
    @Slf4j
    public class SougouScelFileProcessing extends FileProcessing {
        protected static String encoding = "UTF-16LE";
        protected ByteArrayOutputStream output = new ByteArrayOutputStream();
    
        /**
         * 解析单个或者多个文件,如果是多个文件则生成对应的txt文件,{@link #setTargetDir(String)},
         * 如果targetDir不设置,则在当前文件夹下生成相应的txt文件
         *
         * @param filePath 源文件路径
         * @param isAppend false:覆盖内容 true:附加内容
         */
        @Override
        public void parseFile(String filePath, boolean isAppend) {
            File file = new File(filePath);
            if (file.isDirectory()) {
                File items[] = file.listFiles();
                for (int i = 0; i < items.length; i++) {
                    if (!items[i].getName().endsWith(".scel")) {
                        continue;
                    }
    
                    if (targetDir == null) {
                        parseFile(items[i].getAbsolutePath(), items[i].getAbsolutePath().replace(".scel", ".txt"),
                                isAppend);
                    } else {
                        parseFile(items[i].getAbsolutePath(), targetDir + "/" + items[i].getName().replace(".scel", ".txt"),
                                isAppend);
                    }
    
                }
            } else {
                parseFile(filePath, file.getAbsolutePath().replace(".scel", ".txt"), isAppend);
            }
    
        }
    
        /**
         * 解析单个scel文件
         *
         * @param filePath       源文件路径
         * @param targetFilePath
         * @param isAppend       false:覆盖内容 true:附加内容
         */
        @Override
        public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            if (!filePath.endsWith(".scel")) {
                return;
            }
            File input = new File(filePath);
            if (input.length() < 8) {
                // 假如文件小于8字节,不去考虑它
                return;
            }
            FileInputStream in = null;
            SougouScelModel model = null;
            try {
                in = new FileInputStream(input);
                model = read(in);
                if (model == null) {
                    return;
                }
                writeToTargetFile(model, targetFilePath, isAppend);
            } catch (IOException e) {
                log.info(e.getMessage());
                e.printStackTrace();
            }
    
        }
    
        /**
         * 解析多个文件夹,将解析后的内容放到一个文件里
         *
         * @param fileDirPath    源文件夹路径
         * @param targetFilePath 目标文件路径
         * @param isAppend       false:覆盖内容 true:附加内容
         * @throws FileNotFoundException
         */
        @Override
        public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            File dir = new File(fileDirPath);
            if (!dir.exists() || !dir.isDirectory()) {
                throw new IllegalStateException("scel文件夹路径错误   " + targetFilePath);
            }
            File scels[] = dir.listFiles();
            ArrayList<SougouScelModel> models = new ArrayList<>();
            for (int i = 0; i < scels.length; i++) {
                if (!scels[i].getName().endsWith(".scel")) {
                    continue;
                }
                FileInputStream in = null;
                SougouScelModel model = null;
                in = new FileInputStream(scels[i]);
                model = read(in);
                if (model != null) {
                    models.add(model);
                }
            }
            writeToTargetFile(models, targetFilePath, isAppend);
        }
    
        private void writeToTargetFile(SougouScelModel model, String targetFilePath, boolean isAppend) throws IOException {
            List<SougouScelModel> models = new ArrayList<>();
            models.add(model);
            writeToTargetFile(models, targetFilePath, isAppend);
    
        }
    
        /**
         * 将搜狗scel文件解析后的内容写入txt文件
         *
         * @param models
         * @param targetFilePath
         * @param isAppend
         * @throws IOException
         */
        private void writeToTargetFile(List<SougouScelModel> models, String targetFilePath, boolean isAppend)
                throws IOException {
            createParentDir(targetFilePath);
            FileOutputStream out = new FileOutputStream(targetFilePath, isAppend);
            int count = 0;
            for (int k = 0; k < models.size(); k++) {
                // 词<拼音,词>
                Map<String, List<String>> words = models.get(k).getWordMap();
                Set<Map.Entry<String, List<String>>> set = words.entrySet();
                Iterator<Map.Entry<String, List<String>>> iter = set.iterator();
                if (isAppend) {
                    out.write("
    ".getBytes());
                }
                while (iter.hasNext()) {
                    Map.Entry<String, List<String>> entry = iter.next();
                    List<String> list = entry.getValue();
    
                    int size = list.size();
                    for (int i = 0; i < size; i++) {
                        String word = list.get(i);
                        out.write((entry.getKey() + " ").getBytes());
                        // 写入txt文件
                        out.write((word + "
    ").getBytes());
                        count++;
    
                    }
                }
    
            }
            out.close();
            log.info("生成" + targetFilePath.substring(targetFilePath.lastIndexOf("/") + 1) + "成功!,总计写入: " + count + " 条数据!");
    
        }
    
        private SougouScelModel read(InputStream in) {
            SougouScelModel model = new SougouScelModel();
            DataInputStream input = new DataInputStream(in);
            int read;
            try {
                byte[] bytes = new byte[4];
                input.readFully(bytes);
                assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
                input.readFully(bytes);
                int flag1 = bytes[0];
                assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);
                int[] reads = new int[]{8};
                model.setName(readString(input, 0x130, reads));
                model.setType(readString(input, 0x338, reads));
                model.setDescription(readString(input, 0x540, reads));
                model.setSample(readString(input, 0xd40, reads));
                read = reads[0];
                input.skip(0x1540 - read);
                read = 0x1540;
                input.readFully(bytes);
                read += 4;
                assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
                bytes = new byte[128];
                Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
                while (true) {
                    int mark = readUnsignedShort(input);
                    int size = input.readUnsignedByte();
                    input.skip(1);
                    read += 4;
                    assert (size > 0 && (size % 2) == 0);
                    input.readFully(bytes, 0, size);
                    read += size;
                    String py = new String(bytes, 0, size, encoding);
                    pyMap.put(mark, py);
                    if ("zuo".equals(py)) {
                        break;
                    }
                }
                if (flag1 == 0x44) {
                    input.skip(0x2628 - read);
                } else if (flag1 == 0x45) {
                    input.skip(0x26C4 - read);
                } else {
                    throw new RuntimeException("出现意外,联系作者");
                }
                StringBuffer buffer = new StringBuffer();
                Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
                while (true) {
                    int size = readUnsignedShort(input);
                    if (size < 0) {
                        break;
                    }
                    int count = readUnsignedShort(input);
                    int len = count / 2;
                    assert (len * 2 == count);
                    buffer.setLength(0);
                    for (int i = 0; i < len; i++) {
                        int key = readUnsignedShort(input);
                        buffer.append(pyMap.get(key)).append("'");
                    }
                    buffer.setLength(buffer.length() - 1);
                    String py = buffer.toString();
                    List<String> list = wordMap.get(py);
                    if (list == null) {
                        list = new ArrayList<String>();
                        wordMap.put(py, list);
                    }
                    for (int i = 0; i < size; i++) {
                        count = readUnsignedShort(input);
                        if (count > bytes.length) {
                            bytes = new byte[count];
                        }
                        input.readFully(bytes, 0, count);
                        String word = new String(bytes, 0, count, encoding);
                        // 接下来12个字节可能是词频或者类似信息
                        input.skip(12);
                        list.add(word);
                    }
                }
                model.setWordMap(wordMap);
                return model;
            } catch (IOException e) {
                log.info(e.getMessage());
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            return null;
        }
    
        protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
            int read = reads[0];
            input.skip(pos - read);
            read = pos;
            output.reset();
            while (true) {
                int c1 = input.read();
                int c2 = input.read();
                read += 2;
                if (c1 == 0 && c2 == 0) {
                    break;
                } else {
                    output.write(c1);
                    output.write(c2);
                }
            }
            reads[0] = read;
            return new String(output.toByteArray(), encoding);
        }
    
        protected final int readUnsignedShort(InputStream in) throws IOException {
            int ch1 = in.read();
            int ch2 = in.read();
            if ((ch1 | ch2) < 0) {
                return Integer.MIN_VALUE;
            }
            return (ch2 << 8) + (ch1 << 0);
        }
    }
    SougouScelModel.java
    package cn.ucmed.impl;
    
    import lombok.Data;
    import lombok.ToString;
    
    import java.util.List;
    import java.util.Map;
    
    /**
     * Parsed content of one Sogou {@code .scel} dictionary, as filled in by
     * {@code SougouScelFileProcessing}. Accessors are generated by Lombok.
     */
    @Data
    @ToString
    public class SougouScelModel {
        // Words grouped by pinyin: the key is the pinyin with syllables joined by "'",
        // the value holds every word read for that pinyin.
        private Map<String, List<String>> wordMap;
        // String read from file offset 0x130.
        private String name;
        // String read from file offset 0x338.
        private String type;
        // String read from file offset 0x540.
        private String description;
        // String read from file offset 0xd40.
        private String sample;
    }
    TxtFileProcessing.java
    package cn.ucmed.impl;
    
    import lombok.extern.slf4j.Slf4j;
    
    import java.io.*;
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    
    @Slf4j
    public class TxtFileProcessing extends FileProcessing {
    
        // 文字编码
        private String encoding = "UTF-8";
    
        @Override
        public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            if (!filePath.endsWith(".txt")) {
                return;
            }
            File inputFile = new File(filePath);
            if (!inputFile.exists()) {
                log.info(filePath + "   文件不存在");
            } else {
                ArrayList<String> content = new ArrayList<>();
                HashSet<String> set = new HashSet<>();
                createParentDir(targetFilePath);
                File outputFile = new File(targetFilePath);
                if (!isAppend) {
                    // 假如不是附加内容,删除
                    if (outputFile.exists()) {
                        log.info(outputFile.getAbsolutePath() + "   文件存在,删除...");
                        outputFile.delete();
                    }
                } else {
                    // 读取原有的txt文件内容
                    content.addAll(readTargetFile(outputFile));
                }
                content.addAll(readSourceFile(inputFile));
                // 去重
                for (int i = 0; i < content.size(); i++) {
                    set.add(content.get(i));
                }
                // 写入目标文件
                writeToTargetFile(set, outputFile);
    
            }
    
        }
    
        @Override
        public void parseFile(String filePath, boolean isAppend) {
            File file = new File(filePath);
            if (file.isDirectory()) {
                File items[] = file.listFiles();
                for (int i = 0; i < items.length; i++) {
                    if (!items[i].getName().endsWith(".txt")) {
                        continue;
                    }
    
                    if (targetDir == null) {
                        parseFile(items[i].getAbsolutePath(), items[i].getAbsolutePath().replace(".txt", "解析.txt"),
                                isAppend);
                    } else {
                        parseFile(items[i].getAbsolutePath(), targetDir + "/" + items[i].getName(),
                                isAppend);
                    }
    
                }
            } else {
                parseFile(filePath, file.getAbsolutePath().replace(".txt", "解析.txt"), isAppend);
            }
        }
    
        @Override
        public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            File fileDir = new File(fileDirPath);
            if (!fileDir.isDirectory() || !fileDir.exists()) {
                throw new IllegalStateException("文件夹路径错误   " + targetFilePath);
            }
            File file[] = fileDir.listFiles();
            ArrayList<String> content = new ArrayList<>();
            HashSet<String> set = new HashSet<>();
            createParentDir(targetFilePath);
            File outputFile = new File(targetFilePath);
            if (!isAppend) {
                // 假如不是附加内容,删除
                if (outputFile.exists()) {
                    log.info(outputFile.getAbsolutePath() + "   文件存在,删除...");
                    outputFile.delete();
                }
            } else {
                // 读取原有的txt文件内容
                content.addAll(readSourceFile(outputFile));
            }
            for (int i = 0; i < file.length; i++) {
                if (file[i].getName().endsWith(".txt")) {
                    content.addAll(readSourceFile(file[i]));
                }
            }
            // 去重
            for (int i = 0; i < content.size(); i++) {
                set.add(content.get(i));
            }
            // 写入目标文件
            writeToTargetFile(set, outputFile);
    
        }
    
        /**
         * 将内容写入目标文件
         *
         * @param set        词库合集
         * @param outputFile 目标文件
         */
        private void writeToTargetFile(HashSet<String> set, File outputFile) {
            StringBuffer buff = new StringBuffer();
            for (String content : set) {
                buff.append(content);
                buff.append("
    ");
            }
            String content = buff.toString();
    
            FileOutputStream out = null;
            try {
                out = new FileOutputStream(outputFile);
                out.write(content.getBytes());
    
            } catch (IOException e) {
                log.info(e.getMessage());
                e.printStackTrace();
            } finally {
                try {
                    out.close();
                } catch (IOException e) {
                    log.info(e.getMessage());
                    e.printStackTrace();
                }
            }
            log.info("生成" + outputFile.getName() + "成功!,总计写入: " + set.size() + " 条数据!");
        }
    
    
        /**
         * 读取源文件,获取中文词库
         *
         * @param file 源文件
         * @return 中文词库集合
         */
        private List<String> readSourceFile(File file) {
            ArrayList<String> content = new ArrayList<>();
            try {
                // 判断文件是否存在
                if (file.isFile() && file.exists()) {
                    // 考虑到编码格式
                    InputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding);
                    BufferedReader bufferedReader = new BufferedReader(read);
                    String lineTxt = null;
    
                    while ((lineTxt = bufferedReader.readLine()) != null) {
                        String newStr = new String(lineTxt.getBytes("UTF-8"));
                        String split[] = newStr.split(" ");
                        for (int i = 0; i < split.length; i++) {
                            if (i % 2 == 0) {
                                // 拼音字母
                            } else {
                                // 中文词库
                                content.add(split[i]);
                            }
                        }
                    }
                    bufferedReader.close();
                    read.close();
                } else {
                    log.info("找不到源文件   " + file.getAbsolutePath());
                }
            } catch (Exception e) {
                log.info(e.getMessage());
                e.printStackTrace();
            }
            return content;
    
        }
    
        /**
         * 读取已解析好的的词库文件
         *
         * @param file 词库文件
         * @return 词库内容
         */
        private List<String> readTargetFile(File file) {
            ArrayList<String> content = new ArrayList<>();
            try {
                // 判断文件是否存在
                if (file.isFile() && file.exists()) {
                    // 考虑到编码格式
                    InputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding);
                    BufferedReader bufferedReader = new BufferedReader(read);
                    String lineTxt = null;
    
                    while ((lineTxt = bufferedReader.readLine()) != null) {
                        String newStr = new String(lineTxt.getBytes("UTF-8"));
                        if (!newStr.trim().isEmpty()) {
                            content.add(newStr);
                        }
                    }
                    bufferedReader.close();
                    read.close();
                } else {
                    System.err.println("找不到目标文件  " + file.getAbsolutePath());
                }
            } catch (Exception e) {
                log.info(e.getMessage());
                e.printStackTrace();
            }
            return content;
        }
    }

    测试用例:

    /**
     * Sample driver showing the three conversion modes of {@code SougouScelFileProcessing}.
     * The paths are examples from the author's machine.
     */
    public static void main(String[] args) {
        FileProcessing scel = new SougouScelFileProcessing();

        // Mode 1: one .scel file into one txt file, appending to the target.
        String singleSource = "./resolver/src/main/java/cn/ucmed/constant/药品名称大全.scel";
        String singleTarget = "./resolver/src/main/java/cn/ucmed/constant/药品名称大全.txt";
        scel.parseFile(singleSource, singleTarget, true);

        // Mode 2: every .scel file of a folder merged into one txt file
        // (line format: pinyin, a space, then the word).
        String sourceDir = "/Users/ST_iOS/Desktop/test/ciku";
        String mergedTarget = "/Users/ST_iOS/Desktop/test/ciku/txt/汇总.txt";
        try {
            scel.parseFiles(sourceDir, mergedTarget, false);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Mode 3: every .scel file of a folder into its own txt file,
        // stored in the directory configured below.
        String perFileTargetDir = "/Users/ST_iOS/Desktop/test/ciku/多对多";
        scel.setTargetDir(perFileTargetDir);
        scel.parseFile(sourceDir, false);
    }

    本文转自:https://blog.csdn.net/imhxl/article/details/52585968

  • 相关阅读:
    Mysql登录错误:ERROR 1045 (28000): Plugin caching_sha2_password could not be loaded
    Docker配置LNMP环境
    Docker安装mysqli扩展和gd扩展
    Docker常用命令
    Ubuntu常用命令
    单例模式的优缺点和使用场景
    ABP 多租户数据共享
    ABP Core 后台Angular+Ng-Zorro 图片上传
    ERROR Error: If ngModel is used within a form tag, either the name attribute must be set or the form control must be defined as 'standalone' in ngModelOptions.
    AbpCore 执行迁移文件生成数据库报错 Could not find root folder of the web project!
  • 原文地址:https://www.cnblogs.com/miaoying/p/11573982.html
Copyright © 2011-2022 走看看