前提是PDF里面是有文字的!
一次性取得所有页内容:
/// <summary>
/// "Before improvement" variant: exports the text of every page of a PDF in one
/// shot by invoking the document's JavaScript <c>saveAs</c> method with the
/// <c>com.adobe.acrobat.accesstext</c> conversion id.
/// </summary>
/// <param name="pdfFilePath">Path of the PDF to open through the Acrobat AVDoc COM object.</param>
/// <param name="txtDirectoryPath">
/// Value passed unchanged as the first argument of <c>saveAs</c>.
/// NOTE(review): the Acrobat JavaScript API describes that argument as the
/// output file path rather than a directory - confirm against callers.
/// </param>
/// <remarks>
/// Nothing happens (and nothing is reported) when <c>avDoc.Open</c> returns false.
/// </remarks>
public static void ConvertPdfToTxt(string pdfFilePath, string txtDirectoryPath)
{
    // Create the AVDoc COM object.
    CAcroAVDoc avDoc = (Acrobat.CAcroAVDoc)Microsoft.VisualBasic.Interaction.CreateObject("AcroExch.AVDoc");

    // Open the PDF; bail out quietly if it cannot be opened.
    if (!avDoc.Open(pdfFilePath, ""))
    {
        return;
    }

    try
    {
        CAcroPDDoc pdDoc = (CAcroPDDoc)avDoc.GetPDDoc();
        Object jsAcroObj = pdDoc.GetJSObject();
        Type T = jsAcroObj.GetType();

        // The JavaScript bridge object is late-bound, so its methods are invoked by name.
        object[] saveAsParam = { txtDirectoryPath, "com.adobe.acrobat.accesstext" };
        T.InvokeMember("saveAs", BindingFlags.InvokeMethod | BindingFlags.Public | BindingFlags.Instance, null, jsAcroObj, saveAsParam);

        object[] closeDocParam = { true };
        T.InvokeMember("closeDoc", BindingFlags.InvokeMethod | BindingFlags.Public | BindingFlags.Instance, null, jsAcroObj, closeDocParam);
    }
    finally
    {
        // Always close the AVDoc, even when saveAs/closeDoc throws, so the
        // document is not left open in Acrobat. One retry, as in the original.
        if (!avDoc.Close(1))
        {
            avDoc.Close(1);
        }
    }
}
逐页取出:
/// <summary>
/// "After improvement" variant: collects the words of each page separately
/// through the document's JavaScript bridge (<c>getPageNumWords</c> /
/// <c>getPageNthWord</c>).
/// </summary>
/// <param name="pdDoc">An already opened Acrobat PDDoc.</param>
/// <returns>
/// One entry per page, in page order. The key is the 1-based page number as a
/// string; the value is the concatenation of the words returned for that page.
/// A page whose extraction fails (or for which no JavaScript object is
/// available) is still listed, with an empty string as its text.
/// </returns>
public static List<KeyValuePair<String, String>> PdDocGetText(AcroPDDoc pdDoc)
{
    List<KeyValuePair<string, string>> txt = new List<KeyValuePair<string, string>>();
    int pages = pdDoc.GetNumPages();

    for (int i = 0; i < pages; i++)
    {
        // The page object is deliberately not acquired here: the text is read
        // through the JavaScript object by page index, and an acquired page
        // that is never used would only hold on to a COM reference.
        string pageText = "";

        try
        {
            object jso = pdDoc.GetJSObject();
            if (jso != null)
            {
                Type jsType = jso.GetType();

                object jsNumWords = jsType.InvokeMember("getPageNumWords", System.Reflection.BindingFlags.InvokeMethod, null, jso, new object[] { i }, null);
                int numWords = Int32.Parse(jsNumWords.ToString());

                List<string> words = new List<string>(numWords > 0 ? numWords : 0);

                // Word indices are zero-based, so the last valid index is numWords - 1.
                for (int j = 0; j < numWords; j++)
                {
                    // Third argument false: presumably bStrip = false, i.e. keep the
                    // punctuation/whitespace attached to the word - confirm against
                    // the Acrobat JavaScript API reference.
                    object[] wordArgs = new object[] { i, j, false };
                    object jsWord = jsType.InvokeMember("getPageNthWord", System.Reflection.BindingFlags.InvokeMethod, null, jso, wordArgs, null);
                    words.Add((string)jsWord);
                }

                // Text of the current page; a single concatenation instead of
                // repeated string += inside a loop.
                pageText = string.Concat(words);
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failing page must not abort the whole document, but
            // leave a trace instead of swallowing the error silently.
            System.Diagnostics.Debug.WriteLine("PdDocGetText: failed to read page " + (i + 1) + ": " + ex.Message);
            pageText = "";
        }

        // Add the current page's text to the result list.
        txt.Add(new KeyValuePair<string, string>((i + 1).ToString(), pageText));
    }

    return txt;
}
在这个基础之上我们再写一些比如搜索PDF内容的功能就容易多了吧。
补充:这里有一个问题:当PDF的文字是纵向(竖排)排版时,读出来的内容顺序是错乱的,因为文字是按横向的行依次读取的。这个问题困扰我很久了,大家如果有思路,欢迎提出来交流。