zoukankan      html  css  js  c++  java
  • PDF抽取文字 C# with Adobe API

    前提是PDF里面包含可以提取的文字(而不是扫描得到的图片)!

    一次性取得所有页内容:

            /// <summary>
            /// Opens a PDF in Acrobat and exports the text of all pages in one call by invoking
            /// the document's JavaScript <c>saveAs</c> method with the accessible-text converter.
            /// </summary>
            /// <param name="pdfFilePath">Path of the PDF file to open.</param>
            /// <param name="txtDirectoryPath">Destination path, passed unchanged as the first argument of <c>saveAs</c>.</param>
            /// <remarks>
            /// Returns without doing anything when Acrobat reports that the file could not be opened.
            /// Once the file is open, the document is closed again even if the export throws.
            /// </remarks>
            /// <exception cref="InvalidOperationException">The document exposes no JavaScript object.</exception>
            public static void ConvertPdfToTxt(string pdfFilePath, string txtDirectoryPath)
            {
                // Late-bound creation of the Acrobat AVDoc automation object.
                CAcroAVDoc avDoc = (Acrobat.CAcroAVDoc)Microsoft.VisualBasic.Interaction.CreateObject("AcroExch.AVDoc");

                if (!avDoc.Open(pdfFilePath, ""))
                {
                    return;
                }

                try
                {
                    CAcroPDDoc pdDoc = (CAcroPDDoc)avDoc.GetPDDoc();
                    object jsAcroObj = pdDoc.GetJSObject();
                    if (jsAcroObj == null)
                    {
                        throw new InvalidOperationException("The PDF document did not provide a JavaScript object.");
                    }

                    Type jsType = jsAcroObj.GetType();
                    BindingFlags invokeFlags = BindingFlags.InvokeMethod | BindingFlags.Public | BindingFlags.Instance;

                    // Export through the accessible-text conversion filter.
                    object[] saveAsParam = { txtDirectoryPath, "com.adobe.acrobat.accesstext" };
                    jsType.InvokeMember("saveAs", invokeFlags, null, jsAcroObj, saveAsParam);

                    // Ask the JavaScript side to close the document; the argument is passed as bNoSave.
                    object[] closeDocParam = { true };
                    jsType.InvokeMember("closeDoc", invokeFlags, null, jsAcroObj, closeDocParam);
                }
                finally
                {
                    // Always close the AVDoc, including when the export above failed.
                    // A second attempt is made if the first Close reports failure.
                    if (!avDoc.Close(1))
                    {
                        avDoc.Close(1);
                    }
                }
            }

    逐页取出:

            /// <summary>
            /// Reads the text of a PDF document page by page through the Acrobat JavaScript bridge.
            /// </summary>
            /// <param name="pdDoc">The Acrobat PD document to read.</param>
            /// <returns>
            /// One pair per page, in page order. The key is the 1-based page number as a string;
            /// the value is the page's words concatenated in the order Acrobat returns them.
            /// If reading a page fails, the value holds whatever words were read before the failure
            /// (possibly none) and processing continues with the next page.
            /// </returns>
            /// <exception cref="ArgumentNullException"><paramref name="pdDoc"/> is null.</exception>
            public static List<KeyValuePair<String, String>> PdDocGetText(AcroPDDoc pdDoc)
            {
                if (pdDoc == null)
                {
                    throw new ArgumentNullException("pdDoc");
                }

                List<KeyValuePair<String, String>> txt = new List<KeyValuePair<string, string>>();
                int pages = pdDoc.GetNumPages();

                // The JavaScript object belongs to the document, so it is fetched once
                // (on first use) instead of once per page.
                object jso = null;

                for (int i = 0; i < pages; i++)
                {
                    System.Text.StringBuilder pageText = new System.Text.StringBuilder();
                    try
                    {
                        if (jso == null)
                        {
                            jso = pdDoc.GetJSObject();
                        }

                        if (jso != null)
                        {
                            Type jsType = jso.GetType();

                            object jsNumWords = jsType.InvokeMember("getPageNumWords", System.Reflection.BindingFlags.InvokeMethod, null, jso, new object[] { i }, null);
                            int numWords = Convert.ToInt32(jsNumWords, System.Globalization.CultureInfo.InvariantCulture);

                            // Word indices are 0-based, so the last valid index is numWords - 1.
                            for (int j = 0; j < numWords; j++)
                            {
                                // The third argument (bStrip = false) keeps the punctuation and
                                // whitespace attached to each word, so plain concatenation stays readable.
                                object jsWord = jsType.InvokeMember("getPageNthWord", System.Reflection.BindingFlags.InvokeMethod, null, jso, new object[] { i, j, false }, null);
                                pageText.Append(Convert.ToString(jsWord, System.Globalization.CultureInfo.InvariantCulture));
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a failing page must not abort the whole document,
                        // but the failure is reported instead of being swallowed silently.
                        System.Diagnostics.Trace.TraceWarning("PdDocGetText: failed to read page {0}: {1}", i + 1, ex);
                    }

                    // Add the current page's text to the result list.
                    txt.Add(new KeyValuePair<string, string>((i + 1).ToString(), pageText.ToString()));
                }

                return txt;
            }

    在这个基础之上我们再写一些比如搜索PDF内容的功能就容易多了吧。

    补充:这里有一个问题,当PDF的排版是纵向(竖排)的时候,读出来的是乱码,因为文字是按横向的行读取的。这个问题困扰我很久了,大家如果有思路的话可以说出来交流一下。

  • 相关阅读:
    sys模块详解
    os模块详解2
    tyvj 1203 机器分配
    洛谷 P1496 火烧赤壁
    P1204 [USACO1.2]挤牛奶Milking Cows
    bzoj 2120 数颜色
    P2056 采花
    P1972 [SDOI2009]HH的项链
    9.20模拟赛
    P2709 小B的询问
  • 原文地址:https://www.cnblogs.com/Mushrooms/p/3652325.html
Copyright © 2011-2022 走看看