zoukankan      html  css  js  c++  java
  • 使用Java将搜狗词库文件(文件后缀为.scel)转为.txt文件

    要做一个根据词库筛选主要词汇的功能,去搜狗下载专业词汇词库时,发现下载到的是 .scel 文件,而通过在线转换工具(http://tools.bugscaner.com/sceltotxt/)将其转换为 txt 时会报错,因此只能自己编写 Java 程序来完成转换。

    核心代码如下,涉及到四个类:FileProcessing、SougouScelFileProcessing、SougouScelModel、TxtFileProcessing

    文件 FileProcessing.java

    package cn.ucmed.impl;
    
    import java.io.File;
    import java.io.IOException;
    
    /**
     * Base class for converters that turn word-library files into plain {@code .txt} files.
     *
     * <p>Subclasses implement the parsing itself; this class only stores the optional output
     * directory and provides the helper that prepares the output location.
     */
    public abstract class FileProcessing {
        /** Output directory for generated txt files; {@code null} until {@link #setTargetDir(String)} is called. */
        protected String targetDir;

        /**
         * Parses a single file.
         *
         * @param filePath       path of the source file to parse
         * @param targetFilePath path of the file that receives the parsed content
         * @param isAppend       {@code true} to append to the target, {@code false} to overwrite it
         */
        public abstract void parseFile(String filePath, String targetFilePath, boolean isAppend);

        /**
         * Parses every file in a directory and merges the result into one file.
         *
         * @param fileDirPath    path of the directory holding the source files
         * @param targetFilePath path of the file that receives the parsed content
         * @param isAppend       {@code true} to append to the target, {@code false} to overwrite it
         * @throws IOException if reading or writing fails
         */
        public abstract void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException;

        /**
         * Parses one file or, when given a directory, several files, producing one txt file per
         * source file. Output goes to the directory set with {@link #setTargetDir(String)}; when no
         * target directory is set, the txt files are generated in the current folder.
         *
         * @param filePath path of the source file or directory
         * @param isAppend {@code false} to overwrite existing content, {@code true} to append
         */
        public abstract void parseFile(String filePath, boolean isAppend);

        /**
         * Creates the directory that will contain {@code targetFilePath}, including any missing
         * ancestors.
         *
         * <p>The parent is resolved through {@link File#getParentFile()} so that the platform's own
         * separator is honoured instead of assuming {@code '/'}. A path without a parent component
         * needs no directory and is left alone.
         *
         * @param targetFilePath path of the txt file about to be written
         * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt}
         */
        protected void createParentDir(String targetFilePath) {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            File parent = new File(targetFilePath).getParentFile();
            if (parent != null && !parent.exists()) {
                // Best effort: if creation fails, opening the target file reports the problem.
                parent.mkdirs();
            }
        }

        /**
         * Parses a single file, overwriting any existing output.
         *
         * @param filePath path of the source file
         */
        public void parseFile(String filePath) {
            parseFile(filePath, false);
        }

        /**
         * Returns the directory that receives generated txt files.
         *
         * @return the target directory, or {@code null} if none has been set
         */
        public String getTargetDir() {
            return targetDir;
        }

        /**
         * Sets the directory in which parsed txt files are stored.
         *
         * @param targetDir path of the output directory
         */
        public void setTargetDir(String targetDir) {
            this.targetDir = targetDir;
        }
    }
    SougouScelFileProcessing.java
    package cn.ucmed.impl;
    
    import lombok.extern.slf4j.Slf4j;
    
    import java.io.*;
    import java.util.*;
    
    @Slf4j
    public class SougouScelFileProcessing extends FileProcessing {
        // Charset name used to decode the strings read from a .scel file.
        protected static String encoding = "UTF-16LE";
        // Scratch buffer that readString resets and refills with the raw bytes of each string.
        // NOTE(review): this is per-instance state, so one instance presumably must not parse on several threads at once — confirm.
        protected ByteArrayOutputStream output = new ByteArrayOutputStream();
    
        /**
         * Converts one {@code .scel} file, or every {@code .scel} file directly inside a directory,
         * into a {@code .txt} file with the same base name.
         *
         * <p>In directory mode each output is written to {@link #setTargetDir(String) targetDir}
         * when it is set, otherwise next to its source file. A single file is always written next
         * to its source.
         *
         * @param filePath path of a {@code .scel} file or of a directory containing such files
         * @param isAppend {@code false} to overwrite existing content, {@code true} to append
         */
        @Override
        public void parseFile(String filePath, boolean isAppend) {
            File source = new File(filePath);
            if (!source.isDirectory()) {
                parseFile(filePath, toTxtPath(source.getAbsolutePath()), isAppend);
                return;
            }

            File[] entries = source.listFiles();
            if (entries == null) {
                // listFiles() returns null when the directory cannot be read.
                log.info("无法读取文件夹   " + filePath);
                return;
            }
            for (File entry : entries) {
                if (!entry.getName().endsWith(".scel")) {
                    continue;
                }
                String target = (targetDir == null)
                        ? toTxtPath(entry.getAbsolutePath())
                        : targetDir + "/" + toTxtPath(entry.getName());
                parseFile(entry.getAbsolutePath(), target, isAppend);
            }
        }

        /**
         * Swaps a trailing {@code .scel} for {@code .txt}. Only the suffix is touched, so a
         * {@code .scel} fragment elsewhere in the path (for example in a directory name) survives.
         * Paths without the suffix are returned unchanged.
         */
        private static String toTxtPath(String path) {
            String suffix = ".scel";
            if (!path.endsWith(suffix)) {
                return path;
            }
            return path.substring(0, path.length() - suffix.length()) + ".txt";
        }
    
        /**
         * Parses a single {@code .scel} file and writes its words to {@code targetFilePath}.
         *
         * <p>Sources that do not end with {@code .scel} or are shorter than 8 bytes are silently
         * skipped. I/O failures are logged and do not propagate.
         *
         * @param filePath       path of the source {@code .scel} file
         * @param targetFilePath path of the {@code .txt} file to write
         * @param isAppend       {@code false} to overwrite existing content, {@code true} to append
         * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt}
         */
        @Override
        public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            if (!filePath.endsWith(".scel")) {
                return;
            }
            File input = new File(filePath);
            if (input.length() < 8) {
                // Files under 8 bytes cannot hold the two 4-byte header words; ignore them.
                return;
            }
            // read() also closes the stream; closing it again here is harmless and keeps this
            // method leak-free on its own.
            try (FileInputStream in = new FileInputStream(input)) {
                SougouScelModel model = read(in);
                if (model != null) {
                    writeToTargetFile(model, targetFilePath, isAppend);
                }
            } catch (IOException e) {
                // Keep the best-effort contract, but record the file and the full stack trace.
                log.error("解析失败   " + filePath, e);
            }
        }
    
        /**
         * Parses every {@code .scel} file directly inside a directory and writes all of their words
         * into one {@code .txt} file.
         *
         * @param fileDirPath    path of the directory holding the {@code .scel} files
         * @param targetFilePath path of the {@code .txt} file to write
         * @param isAppend       {@code false} to overwrite existing content, {@code true} to append
         * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt} or
         *                               {@code fileDirPath} is not an existing directory
         * @throws IOException           if the directory cannot be listed, a source file cannot be
         *                               opened, or the target cannot be written
         */
        @Override
        public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            File dir = new File(fileDirPath);
            if (!dir.exists() || !dir.isDirectory()) {
                // Report the directory that is actually wrong, not the target file.
                throw new IllegalStateException("scel文件夹路径错误   " + fileDirPath);
            }
            File[] candidates = dir.listFiles();
            if (candidates == null) {
                // listFiles() returns null when the directory cannot be read.
                throw new IOException("无法读取文件夹   " + fileDirPath);
            }

            List<SougouScelModel> models = new ArrayList<>();
            for (File candidate : candidates) {
                if (!candidate.getName().endsWith(".scel")) {
                    continue;
                }
                // read() closes the stream as well; the second close is a no-op.
                try (FileInputStream in = new FileInputStream(candidate)) {
                    SougouScelModel model = read(in);
                    if (model != null) {
                        models.add(model);
                    }
                }
            }
            writeToTargetFile(models, targetFilePath, isAppend);
        }
    
        /**
         * Writes the words of a single parsed model by delegating to the list-based overload.
         *
         * @param model          parsed scel content
         * @param targetFilePath path of the txt file to write
         * @param isAppend       {@code false} to overwrite existing content, {@code true} to append
         * @throws IOException if writing fails
         */
        private void writeToTargetFile(SougouScelModel model, String targetFilePath, boolean isAppend) throws IOException {
            writeToTargetFile(Collections.singletonList(model), targetFilePath, isAppend);
        }
    
        /**
         * 将搜狗scel文件解析后的内容写入txt文件
         *
         * @param models
         * @param targetFilePath
         * @param isAppend
         * @throws IOException
         */
        private void writeToTargetFile(List<SougouScelModel> models, String targetFilePath, boolean isAppend)
                throws IOException {
            createParentDir(targetFilePath);
            FileOutputStream out = new FileOutputStream(targetFilePath, isAppend);
            int count = 0;
            for (int k = 0; k < models.size(); k++) {
                // 词<拼音,词>
                Map<String, List<String>> words = models.get(k).getWordMap();
                Set<Map.Entry<String, List<String>>> set = words.entrySet();
                Iterator<Map.Entry<String, List<String>>> iter = set.iterator();
                if (isAppend) {
                    out.write("
    ".getBytes());
                }
                while (iter.hasNext()) {
                    Map.Entry<String, List<String>> entry = iter.next();
                    List<String> list = entry.getValue();
    
                    int size = list.size();
                    for (int i = 0; i < size; i++) {
                        String word = list.get(i);
                        out.write((entry.getKey() + " ").getBytes());
                        // 写入txt文件
                        out.write((word + "
    ").getBytes());
                        count++;
    
                    }
                }
    
            }
            out.close();
            log.info("生成" + targetFilePath.substring(targetFilePath.lastIndexOf("/") + 1) + "成功!,总计写入: " + count + " 条数据!");
    
        }
    
        /**
         * Parses a scel stream into a {@link SougouScelModel}.
         *
         * <p>The stream is read strictly front to back; the local {@code read} / {@code reads[0]}
         * counters track how many bytes have been consumed so that fixed offsets can be reached by
         * skipping. The stream is always closed before this method returns.
         *
         * <p>NOTE(review): the {@code assert} checks only run when the JVM is started with
         * assertions enabled, and the return values of {@code skip} are not checked — confirm
         * whether malformed or truncated input needs stricter handling.
         *
         * @param in stream positioned at the first byte of the file
         * @return the parsed model, or {@code null} if an {@link IOException} occurred
         * @throws RuntimeException if the first byte of the second header word is neither
         *                          {@code 0x44} nor {@code 0x45}
         */
        private SougouScelModel read(InputStream in) {
            SougouScelModel model = new SougouScelModel();
            DataInputStream input = new DataInputStream(in);
            int read;
            try {
                // First 4-byte header word, expected to be 40 15 00 00.
                byte[] bytes = new byte[4];
                input.readFully(bytes);
                assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
                // Second header word: its first byte selects the word-table offset further down.
                input.readFully(bytes);
                int flag1 = bytes[0];
                assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);
                // 8 bytes consumed so far; readString advances this counter as it skips and reads.
                int[] reads = new int[]{8};
                model.setName(readString(input, 0x130, reads));
                model.setType(readString(input, 0x338, reads));
                model.setDescription(readString(input, 0x540, reads));
                model.setSample(readString(input, 0xd40, reads));
                read = reads[0];
                // Jump to offset 0x1540, where a 4-byte marker (9D 01 00 00) precedes the pinyin table.
                input.skip(0x1540 - read);
                read = 0x1540;
                input.readFully(bytes);
                read += 4;
                assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
                bytes = new byte[128];
                // Pinyin table: index -> syllable, kept in file order.
                Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
                while (true) {
                    // Each entry: 2-byte index, 1-byte length, 1 skipped byte, then the syllable bytes.
                    int mark = readUnsignedShort(input);
                    int size = input.readUnsignedByte();
                    input.skip(1);
                    read += 4;
                    assert (size > 0 && (size % 2) == 0);
                    input.readFully(bytes, 0, size);
                    read += size;
                    String py = new String(bytes, 0, size, encoding);
                    pyMap.put(mark, py);
                    // "zuo" is treated as the last entry of the table.
                    if ("zuo".equals(py)) {
                        break;
                    }
                }
                // The word table starts at a fixed offset chosen by the header flag.
                if (flag1 == 0x44) {
                    input.skip(0x2628 - read);
                } else if (flag1 == 0x45) {
                    input.skip(0x26C4 - read);
                } else {
                    throw new RuntimeException("出现意外,联系作者");
                }
                StringBuffer buffer = new StringBuffer();
                // Pinyin (syllables joined by ') -> words with that pinyin, in file order.
                Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
                while (true) {
                    // Number of words in this group; readUnsignedShort returns a negative value at end of stream.
                    int size = readUnsignedShort(input);
                    if (size < 0) {
                        break;
                    }
                    // Byte length of the pinyin index list; each index occupies 2 bytes.
                    int count = readUnsignedShort(input);
                    int len = count / 2;
                    assert (len * 2 == count);
                    buffer.setLength(0);
                    for (int i = 0; i < len; i++) {
                        int key = readUnsignedShort(input);
                        buffer.append(pyMap.get(key)).append("'");
                    }
                    // Drop the trailing ' separator.
                    buffer.setLength(buffer.length() - 1);
                    String py = buffer.toString();
                    List<String> list = wordMap.get(py);
                    if (list == null) {
                        list = new ArrayList<String>();
                        wordMap.put(py, list);
                    }
                    for (int i = 0; i < size; i++) {
                        // Byte length of the word, then the word itself; the buffer grows on demand.
                        count = readUnsignedShort(input);
                        if (count > bytes.length) {
                            bytes = new byte[count];
                        }
                        input.readFully(bytes, 0, count);
                        String word = new String(bytes, 0, count, encoding);
                        // The next 12 bytes may be word frequency or similar information; skipped.
                        input.skip(12);
                        list.add(word);
                    }
                }
                model.setWordMap(wordMap);
                return model;
            } catch (IOException e) {
                log.info(e.getMessage());
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Reached only after an IOException.
            return null;
        }
    
        /**
         * Reads a string that starts at absolute offset {@code pos} and is terminated by two zero
         * bytes, decoding it with {@code encoding} (UTF-16LE unless changed).
         *
         * @param input stream currently positioned at offset {@code reads[0]}
         * @param pos   absolute offset of the string's first byte
         * @param reads single-element array holding the number of bytes consumed so far; on return
         *              it holds the offset just past the string's terminator
         * @return the decoded string, without the terminator
         * @throws EOFException if the stream ends before {@code pos} is reached or before the
         *                      terminator is found
         * @throws IOException  if reading fails
         */
        protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
            // skip() may skip fewer bytes than requested, so keep going until the offset is reached.
            long remaining = (long) pos - reads[0];
            while (remaining > 0) {
                long skipped = input.skip(remaining);
                if (skipped <= 0) {
                    // A zero-length skip is legal; consume one byte to tell "no progress" from EOF.
                    if (input.read() < 0) {
                        throw new EOFException("scel文件意外结束,无法定位到偏移量 " + pos);
                    }
                    skipped = 1;
                }
                remaining -= skipped;
            }

            int read = pos;
            output.reset();
            while (true) {
                int c1 = input.read();
                int c2 = input.read();
                if (c1 < 0 || c2 < 0) {
                    // read() returns -1 at end of stream; without this check the loop never ends.
                    throw new EOFException("scel文件意外结束,字符串缺少结束符,偏移量 " + pos);
                }
                read += 2;
                if (c1 == 0 && c2 == 0) {
                    break;
                }
                output.write(c1);
                output.write(c2);
            }
            reads[0] = read;
            return new String(output.toByteArray(), encoding);
        }
    
        /**
         * Reads two bytes and combines them as a little-endian unsigned 16-bit value.
         *
         * @param in stream to read from
         * @return the value, or {@link Integer#MIN_VALUE} if either byte could not be read
         * @throws IOException if reading fails
         */
        protected final int readUnsignedShort(InputStream in) throws IOException {
            int low = in.read();
            int high = in.read();
            boolean endReached = low < 0 || high < 0;
            if (endReached) {
                return Integer.MIN_VALUE;
            }
            return low + (high << 8);
        }
    }
    SougouScelModel.java
    package cn.ucmed.impl;
    
    import lombok.Data;
    import lombok.ToString;
    
    import java.util.List;
    import java.util.Map;
    
    /**
     * Parsed content of one Sogou {@code .scel} word-library file.
     */
    @Data
    @ToString
    public class SougouScelModel {
        /** Pinyin (syllables joined by {@code '}) mapped to the words sharing that pinyin. */
        private Map<String, List<String>> wordMap;
        /** String read from offset 0x130 of the file. */
        private String name;
        /** String read from offset 0x338 of the file. */
        private String type;
        /** String read from offset 0x540 of the file. */
        private String description;
        /** String read from offset 0xd40 of the file. */
        private String sample;
    }
    TxtFileProcessing.java
    package cn.ucmed.impl;
    
    import lombok.extern.slf4j.Slf4j;
    
    import java.io.*;
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    
    @Slf4j
    public class TxtFileProcessing extends FileProcessing {
    
        // Charset name used when reading the source and target txt files
        private String encoding = "UTF-8";
    
        /**
         * Extracts the words from one txt file and writes them, de-duplicated, to the target file.
         *
         * <p>When appending, the lines already present in the target are kept and merged with the
         * new words; otherwise an existing target is deleted first. A source that does not end with
         * {@code .txt} is ignored, and a missing source is only logged.
         *
         * @param filePath       path of the source txt file
         * @param targetFilePath path of the txt file to write
         * @param isAppend       {@code true} to keep the target's existing content, {@code false} to replace it
         * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt}
         */
        @Override
        public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            if (!filePath.endsWith(".txt")) {
                return;
            }
            File source = new File(filePath);
            if (!source.exists()) {
                log.info(filePath + "   文件不存在");
                return;
            }

            createParentDir(targetFilePath);
            File target = new File(targetFilePath);
            // The set removes duplicates; entries are added in the order target-then-source.
            HashSet<String> unique = new HashSet<>();
            if (isAppend) {
                // Carry over what the target already contains.
                unique.addAll(readTargetFile(target));
            } else if (target.exists()) {
                // Not appending: remove the old file before writing.
                log.info(target.getAbsolutePath() + "   文件存在,删除...");
                target.delete();
            }
            unique.addAll(readSourceFile(source));
            writeToTargetFile(unique, target);
        }
    
        /**
         * Processes one txt file, or every txt file directly inside a directory.
         *
         * <p>In directory mode each result is written under {@code targetDir} with the source's
         * file name when {@code targetDir} is set; otherwise, and always for a single file, the
         * result is written next to the source with {@code 解析} inserted before the {@code .txt}
         * suffix.
         *
         * @param filePath path of a txt file or of a directory containing txt files
         * @param isAppend {@code false} to overwrite existing content, {@code true} to append
         */
        @Override
        public void parseFile(String filePath, boolean isAppend) {
            File source = new File(filePath);
            if (!source.isDirectory()) {
                parseFile(filePath, withParsedSuffix(source.getAbsolutePath()), isAppend);
                return;
            }

            File[] entries = source.listFiles();
            if (entries == null) {
                // listFiles() returns null when the directory cannot be read.
                log.info("无法读取文件夹   " + filePath);
                return;
            }
            for (File entry : entries) {
                if (!entry.getName().endsWith(".txt")) {
                    continue;
                }
                String target = (targetDir == null)
                        ? withParsedSuffix(entry.getAbsolutePath())
                        : targetDir + "/" + entry.getName();
                parseFile(entry.getAbsolutePath(), target, isAppend);
            }
        }

        /**
         * Turns a trailing {@code .txt} into {@code 解析.txt}. Only the suffix is rewritten, so a
         * {@code .txt} fragment elsewhere in the path is preserved. Paths without the suffix are
         * returned unchanged.
         */
        private static String withParsedSuffix(String path) {
            String suffix = ".txt";
            if (!path.endsWith(suffix)) {
                return path;
            }
            return path.substring(0, path.length() - suffix.length()) + "解析.txt";
        }
    
        /**
         * Extracts the words from every txt file directly inside a directory and writes them,
         * de-duplicated, into a single target file.
         *
         * <p>When appending, the lines already present in the target are kept. They are read with
         * {@code readTargetFile}, because the target holds one word per line, not the
         * {@code "<pinyin> <word>"} pairs that {@code readSourceFile} expects.
         *
         * @param fileDirPath    path of the directory holding the source txt files
         * @param targetFilePath path of the txt file to write
         * @param isAppend       {@code true} to keep the target's existing content, {@code false} to replace it
         * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt} or
         *                               {@code fileDirPath} is not an existing directory
         * @throws IOException           if the directory cannot be listed
         */
        @Override
        public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
            if (!targetFilePath.endsWith(".txt")) {
                throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为   " + targetFilePath);
            }
            File fileDir = new File(fileDirPath);
            if (!fileDir.isDirectory() || !fileDir.exists()) {
                // Report the directory that is actually wrong, not the target file.
                throw new IllegalStateException("文件夹路径错误   " + fileDirPath);
            }
            File[] sources = fileDir.listFiles();
            if (sources == null) {
                // listFiles() returns null when the directory cannot be read.
                throw new IOException("无法读取文件夹   " + fileDirPath);
            }

            createParentDir(targetFilePath);
            File outputFile = new File(targetFilePath);
            // The set removes duplicates across the existing target and all sources.
            HashSet<String> set = new HashSet<>();
            if (isAppend) {
                // Carry over what the target already contains.
                set.addAll(readTargetFile(outputFile));
            } else if (outputFile.exists()) {
                // Not appending: remove the old file before writing.
                log.info(outputFile.getAbsolutePath() + "   文件存在,删除...");
                outputFile.delete();
            }
            for (File source : sources) {
                if (source.getName().endsWith(".txt")) {
                    set.addAll(readSourceFile(source));
                }
            }
            writeToTargetFile(set, outputFile);
        }
    
        /**
         * 将内容写入目标文件
         *
         * @param set        词库合集
         * @param outputFile 目标文件
         */
        private void writeToTargetFile(HashSet<String> set, File outputFile) {
            StringBuffer buff = new StringBuffer();
            for (String content : set) {
                buff.append(content);
                buff.append("
    ");
            }
            String content = buff.toString();
    
            FileOutputStream out = null;
            try {
                out = new FileOutputStream(outputFile);
                out.write(content.getBytes());
    
            } catch (IOException e) {
                log.info(e.getMessage());
                e.printStackTrace();
            } finally {
                try {
                    out.close();
                } catch (IOException e) {
                    log.info(e.getMessage());
                    e.printStackTrace();
                }
            }
            log.info("生成" + outputFile.getName() + "成功!,总计写入: " + set.size() + " 条数据!");
        }
    
    
        /**
         * Reads a source file whose lines hold space-separated tokens and collects every token at
         * an odd position (index 1, 3, ...). In the {@code "<pinyin> <word>"} layout those tokens
         * are the words.
         *
         * <p>Lines are used exactly as decoded by the reader; re-encoding them through the
         * platform default charset would corrupt the text on systems where that is not UTF-8.
         * A missing file or a read failure is logged, and whatever was collected so far is returned.
         *
         * @param file the source file
         * @return the collected words; empty if the file does not exist
         */
        private List<String> readSourceFile(File file) {
            List<String> content = new ArrayList<>();
            if (!file.isFile() || !file.exists()) {
                log.info("找不到源文件   " + file.getAbsolutePath());
                return content;
            }
            // Declaring the raw stream separately guarantees it is closed even if the charset
            // name is rejected while constructing the reader.
            try (InputStream in = new FileInputStream(file);
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] tokens = line.split(" ");
                    // Even positions hold the pinyin, odd positions hold the word.
                    for (int i = 1; i < tokens.length; i += 2) {
                        content.add(tokens[i]);
                    }
                }
            } catch (IOException e) {
                log.error("读取源文件失败   " + file.getAbsolutePath(), e);
            }
            return content;
        }
    
        /**
         * Reads an already generated word file and returns its non-blank lines.
         *
         * <p>Lines are used exactly as decoded by the reader; re-encoding them through the
         * platform default charset would corrupt the text on systems where that is not UTF-8.
         * A missing file or a read failure is logged, and whatever was collected so far is returned.
         *
         * @param file the word file
         * @return the non-blank lines of the file; empty if the file does not exist
         */
        private List<String> readTargetFile(File file) {
            List<String> content = new ArrayList<>();
            if (!file.isFile() || !file.exists()) {
                // Reported through the logger, like every other message in this class.
                log.info("找不到目标文件  " + file.getAbsolutePath());
                return content;
            }
            // Declaring the raw stream separately guarantees it is closed even if the charset
            // name is rejected while constructing the reader.
            try (InputStream in = new FileInputStream(file);
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (!line.trim().isEmpty()) {
                        content.add(line);
                    }
                }
            } catch (IOException e) {
                log.error("读取目标文件失败  " + file.getAbsolutePath(), e);
            }
            return content;
        }
    }

    测试用例:

    /**
     * Demonstrates the three conversion modes of {@code SougouScelFileProcessing}.
     */
    public static void main(String[] args) {
        FileProcessing converter = new SougouScelFileProcessing();

        // 1) One scel file -> one txt file, appending to the target.
        String singleSource = "./resolver/src/main/java/cn/ucmed/constant/药品名称大全.scel";
        String singleTarget = "./resolver/src/main/java/cn/ucmed/constant/药品名称大全.txt";
        converter.parseFile(singleSource, singleTarget, true);

        // 2) Many scel files -> one merged txt file (line format: pinyin, space, word).
        String libraryDir = "/Users/ST_iOS/Desktop/test/ciku";
        try {
            converter.parseFiles(libraryDir, "/Users/ST_iOS/Desktop/test/ciku/txt/汇总.txt", false);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // 3) Many scel files -> one txt file each, stored in the configured target directory.
        converter.setTargetDir("/Users/ST_iOS/Desktop/test/ciku/多对多");
        converter.parseFile(libraryDir, false);
    }

    本文转自:https://blog.csdn.net/imhxl/article/details/52585968

  • 相关阅读:
    数据类型
    java基础
    Codeforces Round #655 (Div. 2) B. Omkar and Last Class of Math(数论)
    Codeforces Round #655 (Div. 2) A. Omkar and Completion(构造)
    LibreOJ
    QT入门-QMainWindow类
    WCF 请求与响应参数大小设置
    Python 代码性能优化技巧
    lists,tuples and sets of Python
    SQL Language
  • 原文地址:https://www.cnblogs.com/miaoying/p/11573982.html
Copyright © 2011-2022 走看看