zoukankan      html  css  js  c++  java
  • PDF抽取文字 C# with Adobe API

    前提是PDF里面含有可提取的文字(纯图片的扫描件不适用)!

    一次性取得所有页内容:

            /// <summary>
            /// "Before improvement" variant: exports the words of every page in a single call by
            /// opening the PDF in Acrobat and invoking the document's JavaScript <c>saveAs</c>
            /// method with the <c>com.adobe.acrobat.accesstext</c> conversion ID.
            /// </summary>
            /// <param name="pdfFilePath">Path of the PDF file passed to <c>AVDoc.Open</c>.</param>
            /// <param name="txtDirectoryPath">
            /// Destination path, passed unchanged as the first argument of <c>saveAs</c>.
            /// NOTE(review): despite the name, <c>saveAs</c> presumably expects a file path rather
            /// than a directory - confirm against the callers.
            /// </param>
            /// <remarks>
            /// Returns without doing anything when Acrobat reports that the file could not be opened.
            /// Exceptions raised by the export propagate to the caller, but the document is closed first.
            /// </remarks>
            public static void ConvertPdfToTxt(string pdfFilePath, string txtDirectoryPath)
            {
                // Create the Acrobat AVDoc automation object.
                CAcroAVDoc avDoc = (Acrobat.CAcroAVDoc)Microsoft.VisualBasic.Interaction.CreateObject("AcroExch.AVDoc");

                // Nothing to export (and nothing to close) if the PDF cannot be opened.
                if (!avDoc.Open(pdfFilePath, ""))
                {
                    return;
                }

                try
                {
                    CAcroPDDoc pdDoc = (CAcroPDDoc)avDoc.GetPDDoc();
                    object jsAcroObj = pdDoc.GetJSObject();
                    Type jsType = jsAcroObj.GetType();

                    const BindingFlags invokeFlags =
                        BindingFlags.InvokeMethod |
                        BindingFlags.Public |
                        BindingFlags.Instance;

                    // Write the text export to the requested destination.
                    object[] saveAsParam = { txtDirectoryPath, "com.adobe.acrobat.accesstext" };
                    jsType.InvokeMember("saveAs", invokeFlags, null, jsAcroObj, saveAsParam);

                    // Close the document from the JavaScript side; the argument is passed as in the
                    // original code (presumably "do not save changes").
                    object[] closeDocParam = { true };
                    jsType.InvokeMember("closeDoc", invokeFlags, null, jsAcroObj, closeDocParam);
                }
                finally
                {
                    // Always close the AVDoc, even when the export throws, so the file is not
                    // left open inside Acrobat. A failed close is retried once.
                    if (!avDoc.Close(1))
                    {
                        avDoc.Close(1);
                    }
                }
            }

    逐页取出:

            /// <summary>
            /// "After improvement" variant: reads the words of each page separately through the
            /// document's JavaScript object (<c>getPageNumWords</c> / <c>getPageNthWord</c>).
            /// </summary>
            /// <param name="pdDoc">The opened Acrobat PDDoc to read from.</param>
            /// <returns>
            /// One entry per page, in page order. The key is the 1-based page number as a string and
            /// the value is the concatenation of the page's words. The value is empty when the
            /// JavaScript object is unavailable, and holds only the words read so far when reading
            /// the page fails part-way.
            /// </returns>
            /// <exception cref="ArgumentNullException"><paramref name="pdDoc"/> is null.</exception>
            public static List<KeyValuePair<String, String>> PdDocGetText(AcroPDDoc pdDoc)
            {
                if (pdDoc == null)
                {
                    throw new ArgumentNullException("pdDoc");
                }

                List<KeyValuePair<String, String>> txt = new List<KeyValuePair<string, string>>();
                int pages = pdDoc.GetNumPages();

                // The JavaScript bridge is the same for every page, so fetch it once.
                object jso = null;
                Type jsoType = null;
                try
                {
                    jso = pdDoc.GetJSObject();
                    if (jso != null)
                    {
                        jsoType = jso.GetType();
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: without the bridge every page is reported with empty text.
                    System.Diagnostics.Trace.TraceWarning("PdDocGetText: GetJSObject failed: {0}", ex);
                    jso = null;
                }

                for (int i = 0; i < pages; i++)
                {
                    System.Text.StringBuilder pageText = new System.Text.StringBuilder();

                    if (jso != null)
                    {
                        try
                        {
                            object[] args = new object[] { i };
                            object jsNumWords = jsoType.InvokeMember("getPageNumWords", System.Reflection.BindingFlags.InvokeMethod, null, jso, args, null);
                            int numWords = Convert.ToInt32(jsNumWords, System.Globalization.CultureInfo.InvariantCulture);

                            // Word indices run from 0 to numWords - 1.
                            for (int j = 0; j < numWords; j++)
                            {
                                // The third argument is passed as false, as in the original code
                                // (presumably "do not strip punctuation/whitespace" - confirm).
                                object[] argsj = new object[] { i, j, false };
                                object jsWord = jsoType.InvokeMember("getPageNthWord", System.Reflection.BindingFlags.InvokeMethod, null, jso, argsj, null);

                                // Appending a null string is a no-op, so missing words are skipped.
                                pageText.Append(jsWord as string);
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best effort: keep the words read so far and carry on with the next page.
                            System.Diagnostics.Trace.TraceWarning("PdDocGetText: reading page {0} failed: {1}", i + 1, ex);
                        }
                    }

                    // Add the current page's text to the result list.
                    txt.Add(new KeyValuePair<string, string>(
                        (i + 1).ToString(System.Globalization.CultureInfo.InvariantCulture),
                        pageText.ToString()));
                }

                return txt;
            }

    在这个基础之上我们再写一些比如搜索PDF内容的功能就容易多了吧。

    补充:这里有一个问题,当遇到PDF排版是纵向的时候,读出来的是乱码,因为行是横向的。这个困扰我很久了,大家如果有思路的话可以说出来交流一下。

  • 相关阅读:
    @media screen响应式
    gulp轻松上手
    Node.js基本讲解
    百度地图
    SQL语言(增删改查)
    AJAX基本介绍(web前端)
    找出链表的第一个公共节点
    微软算法100题58 从尾到头输出链表(java)
    最长递增子序列
    各种排序算法
  • 原文地址:https://www.cnblogs.com/Mushrooms/p/3652325.html
Copyright © 2011-2022 走看看