zoukankan      html  css  js  c++  java
  • Java 解析chm文件实战(原创)

     

    需求:java解析chm文件,并将内容插入数据库和redis.

    用Java解析chm文件,网上除了GitHub上有人零星提过几句之外,几乎没有可参考的资料;chm4j这个库本身也没有什么文档。本着服务大众的精神,我把整个流程整理了一下。时间仓促,错误之处在所难免,望指正。

     

    第一步:下载chm4j.jar以及依赖

    http://sourceforge.net/projects/chm4j/

     

    第二步:新建java工程,建一个解析ParseChm类,建一个解析测试类,类似:

    ParseChm类:

    //下面的包,请导入chm4j.jar,并且把chm4j.dll拷贝到jre的lib目录内,linux或mac请拷贝libchm4j.so即可,因为chm4j.jar依赖于C++编写的本地库

     

    package cn.lswe.baseframe.utils;

     

    import java.io.File;

    import java.io.FileOutputStream;

    import java.io.IOException;

    import java.io.InputStream;

    import java.io.OutputStream;

     

    import org.chm4j.*;

     

    import cn.lswe.baseframe.validator.Conf;

     

    public class ParseChm {

              public static void main(String... args) {

        

            try {

                ChmFile cFile = new ChmFile(Conf.ChmSOurce);

                String dir = Conf.dir;

                ChmEntry.Attribute attributes = ChmEntry.Attribute.ALL;

                ChmEntry[] entries = cFile.entries(attributes);

                for (ChmEntry entry : entries) {

                    listChmEntry(dir, entry, attributes);

                }

            } catch (IOException ex) {

                System.out.println("Error : " + ex.getMessage());

            }

        }

     

        /**

         * Extracts recursively the sub entries of the specified ChmEntry into the

         * specified output directory according to the specified attributes.

         * @param output The output directory.

         * @param entry

         * @param attributes

         * @throws java.io.IOException If an I/O error occurs.

         */

        private static void listChmEntry(String output, ChmEntry entry, ChmEntry.Attribute attributes) throws IOException {

            printEntry(entry);

            String er=GuidHelper.CreateGuid().toString();

            File dest = new File(output, entry.getPath());

          

            if (entry.hasAttribute(ChmEntry.Attribute.DIRECTORY)) {

                if (!dest.isDirectory()) {

                    if (!dest.mkdirs()) {

                        throw new IOException("failed to create directory : " + dest);

                    }

                }

                for (ChmEntry e : entry.entries(attributes)) {

                    listChmEntry(output, e, attributes);

                }

            } else {

                InputStream in = null;

                OutputStream out = null;

                try {

                    in = entry.getInputStream();

                    out = new FileOutputStream(dest);

                    int bufferSize = 1024;

                    byte[] data = new byte[bufferSize];

                    int nbRead;

                    while ((nbRead = in.read(data)) > 0) {

                        out.write(data, 0, nbRead);

                        out.flush();

                    }

                } catch (IOException ex) {

                    System.out.println(ex.getMessage());

                } finally {

                    try {

                        if (out != null) {

                            out.close();

                        }

                    } finally {

                        if (in != null) {

                            in.close();

                        }

                    }

                }

            }

        }

     

        /**

         * Display the specified entry.

         * @param entry

         */

        private static void printEntry(ChmEntry entry) {

            StringBuilder sb = new StringBuilder("Extract entry " + entry + "(");

            boolean first = true;

            for (ChmEntry.Attribute attribute : entry.getAttributes()) {

                if (first) {

                    first = false;

                } else {

                    sb.append(", ");

                }

                sb.append(attribute);

            }

            sb.append(")");

            System.out.println(sb.toString());

    }

    }

    这样就得到了若干中间html文件(两万多个)。注意:chm文件的格式相当复杂,所以才采用这种先解包成html、再解析html的处理方式;事实上,我接下来处理word、freemind文件时也都是这样做的。

    测试类:

    package cn.lswe.baseframe.spider;

    import java.io.FileNotFoundException;

    import java.util.LinkedList;

    import java.util.List;

     

    import cn.lswe.baseframe.utils.FileHelper;

    import cn.lswe.baseframe.validator.Conf;

    import us.codecraft.webmagic.Page;

    import us.codecraft.webmagic.Site;

    import us.codecraft.webmagic.Spider;

    import us.codecraft.webmagic.pipeline.ConsolePipeline;

     

    import us.codecraft.webmagic.processor.PageProcessor;

     

    public class ParseDisease implements PageProcessor {

     

             private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

     

        @Override

        public void process(Page page) {

        List<String>links=new LinkedList<String>();

        links.add(Conf.diseaseDataSource);

       

            page.addTargetRequests(links);

            List<String> tdList = page.getHtml().xpath("table").xpath("td").all();

    //这里利用了webmagic爬虫框架,可以参照这个链接做:

    // http://webmagic.io/docs/zh/

     

            int j=0;

            int k=0;

            for(int i=0;i<tdList.size();i++){

    //在这里,可以筛选,处理你的内容了

    //插入jedis,插入数据库都ok

            j=i+1;

            k=i+2;

            String td1=tdList.get(i);

            String td2=tdList.get(i);//第二列要拆分为一个数组,它是第一列的下一级分类

            String td3=tdList.get(i);//如果第一列是字母,第三列和第二列一一对应,如果第一列是汉字,第三列和第二列第二行开始一一对应,其编码是”B”+第一行第三列+本行第三列

            }

                 System.out.println(tdList);

        }

     

        @Override

        public Site getSite() {

            return site;

        }

     

     

             @SuppressWarnings("deprecation")

             public static void testSpider() {

             // Conf.diseaseDataSource 嗅探的起点,比如www.ask.com,为了速度,请把所有资源文件部署

    //localhost

         Spider.create(new ParseDisease())

                       .addUrl(Conf.diseaseDataSource)

             .pipeline(new  ConsolePipeline()).thread(5).run();

             //开启5个线程抓取

        

        }

    }.

     

    调用方法:

     

        @ResponseBody

        @RequestMapping("/test/spider")

        public void spider()

        {

            //OschinaBlogPageProcesser.testSpider();

           

            ParseChm.testSpider();

        }

       

     

     

  • 相关阅读:
    解决urbuntu桌面本客户端输入ll command not found
    小白学习安全测试(二)——httrack的安装和使用
    Selenium + java不借助autolt实现下载文件到指定目录
    用例设计工具PICT — 输入组合覆盖
    解决创建maven项目Could not resolve archetype org.apache.maven.archetypes:maven-archetype-quickstart问题
    作死的自动化测试【转】
    测试开发是什么?为什么现在那么多公司都要招聘测试开发?【转】
    MySql的触发器
    MySql的存储过程
    MySql的索引操作
  • 原文地址:https://www.cnblogs.com/aobama/p/4782901.html
Copyright © 2011-2022 走看看