zoukankan      html  css  js  c++  java
  • C#解析PDF

    C#解析PDF的方式有很多,比较好用的有iTextSharp和PDFBox。

    PDF内容页如果是图片类型,例如扫描件,则需要进行OCR(光学字符识别)。

    文本内容的PDF文档,解析的过程中,我目前仅发现能以字符串的形式读取的,不能够读取其中的表格。据说PDF文档结构中是没有表格概念的,因此这个自然是读不到的,如果果真如此,则PDF中表格内容的解析,只能对获取到的字符串按照一定的逻辑自行解析了。

    iTextSharp是一个C#开源项目,PDFBox为Java开源项目,借助于IKVM可以在.NET平台下使用。

    Pdf转换Image,使用的是GhostScript,可以以API的方式调用,也可以以Windows命令行的方式调用。

    OCR使用的是Asprise,识别效果较好(商业),另外还可以使用MS的ImageScaning(2007)或OneNote(2010)(需要依赖Office组件),或Tesseract(HP->Google)(效果很差)。

    附上iTextSharp、PDFBox对PDF的解析代码。

    iTextSharp辅助类

      1 using System;
      2 using System.Collections.Generic;
      3 using System.Text;
      4 
      5 using iTextSharp.text.pdf;
      6 using iTextSharp.text.pdf.parser;
      7 using System.IO;
      8 
      9 namespace eyuan
     10 {
     /// <summary>
     /// Static helpers that read PDF files through iTextSharp's <c>PdfReader</c>.
     /// Failures are reported through <c>LogHandler.LogWrite</c> and signalled to the
     /// caller with a sentinel return value instead of an exception.
     /// </summary>
     public static class ITextSharpHandler
     {
         /// <summary>
         /// Extracts the text of every page of a PDF file.
         /// </summary>
         /// <param name="fileName">Path of the PDF file to read.</param>
         /// <returns>
         /// The text of all pages, one <c>AppendLine</c> per page; an empty string when the
         /// file does not exist or cannot be opened. If extraction fails part-way, the text
         /// collected so far is returned.
         /// </returns>
         public static string ReadPdf(string fileName)
         {
             if (!File.Exists(fileName))
             {
                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                 return string.Empty;
             }

             PdfReader reader = OpenReader(fileName);
             if (reader == null)
             {
                 return string.Empty;
             }

             StringBuilder sbFileContent = new StringBuilder();
             try
             {
                 // Page numbers passed to GetTextFromPage start at 1.
                 for (int i = 1; i <= reader.NumberOfPages; i++)
                 {
                     sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
                 }
             }
             catch (Exception ex)
             {
                 LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
             }
             finally
             {
                 reader.Close();
             }

             return sbFileContent.ToString();
         }

         /// <summary>
         /// Returns the number of pages of a PDF file.
         /// </summary>
         /// <param name="fileName">Path of the PDF file to inspect.</param>
         /// <returns>The page count, or -1 when the file does not exist or cannot be opened.</returns>
         public static int GetPdfPageCount(string fileName)
         {
             if (!File.Exists(fileName))
             {
                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                 return -1;
             }

             PdfReader reader = OpenReader(fileName);
             if (reader == null)
             {
                 return -1;
             }

             try
             {
                 return reader.NumberOfPages;
             }
             finally
             {
                 // The reader was previously never closed here, leaving the file open.
                 reader.Close();
             }
         }

         /// <summary>
         /// Opens a <c>PdfReader</c> on <paramref name="fileName"/>, logging any failure.
         /// </summary>
         /// <param name="fileName">Path of the PDF file to open.</param>
         /// <returns>The open reader (the caller must close it), or null when opening failed.</returns>
         private static PdfReader OpenReader(string fileName)
         {
             try
             {
                 return new PdfReader(fileName);
             }
             catch (Exception ex)
             {
                 // When the constructor throws there is no reader instance to close.
                 LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
                 return null;
             }
         }
     }
    108 }

     PDFBox辅助类

     1 using org.pdfbox.pdmodel;
     2 using org.pdfbox.util;
     3 using System;
     4 using System.Collections.Generic;
     5 using System.IO;
     6 using System.Text;
     7 
     8 namespace eyuan
     9 {
    /// <summary>
    /// Static helper that extracts text from PDF files with the PDFBox classes
    /// <c>PDDocument</c> and <c>PDFTextStripper</c>.
    /// </summary>
    public static class PdfBoxHandler
    {
        /// <summary>
        /// Reads the text content of a PDF file.
        /// </summary>
        /// <param name="input">Path of the PDF file.</param>
        /// <returns>
        /// The extracted text; null when the file does not exist, cannot be loaded,
        /// or text extraction fails (each failure is written to the log).
        /// </returns>
        public static string ReadPdf(string input)
        {
            if (!File.Exists(input))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
                return null;
            }

            // Step 1: load the document; a load failure ends the call immediately.
            PDDocument document = null;
            try
            {
                document = PDDocument.load(input);
            }
            catch (Exception loadError)
            {
                LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", new string[] { input, loadError.ToString() }));
                return null;
            }

            // Step 2: extract the text; the document is closed whether or not this succeeds.
            string extractedText = null;
            try
            {
                PDFTextStripper textStripper = new PDFTextStripper();
                extractedText = textStripper.getText(document);
            }
            catch (Exception parseError)
            {
                LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", new string[] { input, parseError.ToString() }));
            }
            finally
            {
                if (document != null)
                {
                    document.close();
                }
            }

            return extractedText;
        }
    }
    76 }

    另外附上PDF转Image,然后对Image进行OCR的代码。

    转换PDF为Jpeg图片代码(GhostScript辅助类)

      1 using System;
      2 using System.Collections;
      3 using System.Collections.Generic;
      4 using System.Runtime.InteropServices;
      5 using System.Text;
      6 
      7 namespace eyuan
      8 {
      /// <summary>
      /// Thin wrapper around the Ghostscript C API (gsdll32.dll) that renders pages of a
      /// PDF file to an image file.
      /// </summary>
      /// <remarks>
      /// NOTE(review): the DLL name is hard-coded to gsdll32.dll, which presumably requires
      /// the host process to run as 32-bit - confirm for your deployment.
      /// </remarks>
      public class GhostscriptHandler
      {
          #region GhostScript Import
          /// <summary>
          /// Creates a new Ghostscript instance.
          /// This instance is passed to most other gsapi functions.
          /// The caller_handle will be provided to callback functions.
          /// At this stage, Ghostscript supports only one instance.
          /// </summary>
          /// <param name="pinstance">Receives the instance handle.</param>
          /// <param name="caller_handle">Opaque handle handed back to callbacks.</param>
          /// <returns>Ghostscript return code.</returns>
          [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")]
          private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle);

          /// <summary>
          /// This is the important function that will perform the conversion.
          /// </summary>
          /// <param name="instance">Instance handle from gsapi_new_instance.</param>
          /// <param name="argc">Number of entries in <paramref name="argv"/>.</param>
          /// <param name="argv">Pointer to an array of pointers to null-terminated strings.</param>
          /// <returns>Ghostscript return code.</returns>
          [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")]
          private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv);

          /// <summary>
          /// Exit the interpreter.
          /// This must be called on shutdown if gsapi_init_with_args() has been called,
          /// and just before gsapi_delete_instance().
          /// </summary>
          /// <param name="instance">Instance handle from gsapi_new_instance.</param>
          /// <returns>Ghostscript return code.</returns>
          [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")]
          private static extern int gsapi_exit(IntPtr instance);

          /// <summary>
          /// Destroy an instance of Ghostscript.
          /// Before you call this, Ghostscript must have finished.
          /// If Ghostscript has been initialised, you must call gsapi_exit before gsapi_delete_instance.
          /// </summary>
          /// <param name="instance">Instance handle from gsapi_new_instance.</param>
          [DllImport("gsdll32.dll", EntryPoint = "gsapi_delete_instance")]
          private static extern void gsapi_delete_instance(IntPtr instance);
          #endregion

          #region Constants
          // Return codes that the Ghostscript API documentation describes as "not an error".
          private const int GsErrorQuit = -101;
          private const int GsErrorInfo = -110;
          #endregion

          #region Fields
          private string _sDeviceFormat;
          private int _iWidth;
          private int _iHeight;
          private int _iResolutionX;
          private int _iResolutionY;
          private int _iJPEGQuality;
          private Boolean _bFitPage;
          private IntPtr _objHandle;
          #endregion

          #region Properties
          /// <summary>Ghostscript output device name, emitted as <c>-sDEVICE=</c>.</summary>
          public string OutputFormat
          {
              get { return _sDeviceFormat; }
              set { _sDeviceFormat = value; }
          }

          /// <summary>Output width; emitted as <c>-g{Width}x{Height}</c> when both are positive.</summary>
          public int Width
          {
              get { return _iWidth; }
              set { _iWidth = value; }
          }

          /// <summary>Output height; emitted as <c>-g{Width}x{Height}</c> when both are positive.</summary>
          public int Height
          {
              get { return _iHeight; }
              set { _iHeight = value; }
          }

          /// <summary>Horizontal resolution, emitted as <c>-r</c>. Overwritten on every <see cref="Convert"/> call.</summary>
          public int ResolutionX
          {
              get { return _iResolutionX; }
              set { _iResolutionX = value; }
          }

          /// <summary>Vertical resolution, emitted as <c>-r</c>. Overwritten on every <see cref="Convert"/> call.</summary>
          public int ResolutionY
          {
              get { return _iResolutionY; }
              set { _iResolutionY = value; }
          }

          /// <summary>When true, <c>-dPDFFitPage</c> is added to the arguments.</summary>
          public Boolean FitPage
          {
              get { return _bFitPage; }
              set { _bFitPage = value; }
          }

          /// <summary>
          /// Quality of compression of JPG. Emitted as <c>-dJPEGQ=</c> when the value is
          /// between 1 and 100 and a JPEG output device is selected.
          /// </summary>
          public int JPEGQuality
          {
              get { return _iJPEGQuality; }
              set { _iJPEGQuality = value; }
          }
          #endregion

          #region Construction
          /// <summary>
          /// Creates a handler whose <paramref name="objHandle"/> is passed to Ghostscript as the caller handle.
          /// </summary>
          /// <param name="objHandle">Caller handle forwarded to <c>gsapi_new_instance</c>.</param>
          public GhostscriptHandler(IntPtr objHandle)
          {
              _objHandle = objHandle;
          }

          /// <summary>Creates a handler with a null (<see cref="IntPtr.Zero"/>) caller handle.</summary>
          public GhostscriptHandler()
          {
              _objHandle = IntPtr.Zero;
          }
          #endregion

          #region String handling
          /// <summary>
          /// Converts a string to a null-terminated byte array for Ghostscript.
          /// The array is later pinned with <c>GCHandle.Alloc(..., GCHandleType.Pinned)</c>
          /// and passed by address.
          /// </summary>
          /// <param name="str">Argument text.</param>
          /// <returns>Encoded bytes followed by a single 0 terminator.</returns>
          private byte[] StringToAnsiZ(string str)
          {
              // The previous per-character (byte) cast kept only the low 8 bits of each char,
              // which corrupted any non-ASCII path. ASCII input produces the same bytes as before.
              // NOTE(review): Encoding.Default is assumed to be the system ANSI code page
              // (true on .NET Framework) and to match Ghostscript's argument encoding - confirm.
              byte[] encoded = Encoding.Default.GetBytes(str);
              byte[] terminated = new byte[encoded.Length + 1];
              Array.Copy(encoded, terminated, encoded.Length);
              terminated[encoded.Length] = 0;
              return terminated;
          }
          #endregion

          #region Conversion
          /// <summary>
          /// Renders a page range of a PDF file with Ghostscript.
          /// Failures are written to the log; the method does not report them to the caller.
          /// </summary>
          /// <param name="inputFile">Path of the input PDF file.</param>
          /// <param name="outputFile">Path of the output image file (<c>-sOutputFile</c>).</param>
          /// <param name="firstPage">First page to render (<c>-dFirstPage</c>).</param>
          /// <param name="lastPage">Last page to render (<c>-dLastPage</c>).</param>
          /// <param name="deviceFormat">Ghostscript device name (<c>-sDEVICE</c>), e.g. "jpeg".</param>
          /// <param name="width">
          /// Despite its name this value is stored in <see cref="ResolutionX"/> and emitted as
          /// the X part of <c>-r</c>; it is not the pixel width.
          /// </param>
          /// <param name="height">
          /// Despite its name this value is stored in <see cref="ResolutionY"/> and emitted as
          /// the Y part of <c>-r</c>; it is not the pixel height.
          /// </param>
          public void Convert(string inputFile, string outputFile,
              int firstPage, int lastPage, string deviceFormat, int width, int height)
          {
              if (!System.IO.File.Exists(inputFile))
              {
                  LogHandler.LogWrite(string.Format("文件{0}不存在", inputFile));
                  return;
              }

              string[] sArgs = GetGeneratedArgs(inputFile, outputFile,
                  firstPage, lastPage, deviceFormat, width, height);
              int argCount = sArgs.Length;

              GCHandle[] pinnedArgs = new GCHandle[argCount];
              GCHandle pinnedArgv = default(GCHandle);
              try
              {
                  // Pin each encoded argument and collect its address, then pin the pointer
                  // array itself so native code sees a stable char** for the whole call.
                  IntPtr[] argPointers = new IntPtr[argCount];
                  for (int i = 0; i < argCount; i++)
                  {
                      pinnedArgs[i] = GCHandle.Alloc(StringToAnsiZ(sArgs[i]), GCHandleType.Pinned);
                      argPointers[i] = pinnedArgs[i].AddrOfPinnedObject();
                  }
                  pinnedArgv = GCHandle.Alloc(argPointers, GCHandleType.Pinned);

                  IntPtr instance;
                  int returnCode = gsapi_new_instance(out instance, _objHandle);
                  if (returnCode < 0)
                  {
                      // Without an instance there is nothing to run, exit or delete.
                      LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.错误:{1}",
                          inputFile, "gsapi_new_instance returned " + returnCode));
                      return;
                  }

                  try
                  {
                      returnCode = gsapi_init_with_args(instance, argCount, pinnedArgv.AddrOfPinnedObject());
                      if (returnCode < 0 && returnCode != GsErrorQuit && returnCode != GsErrorInfo)
                      {
                          LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.错误:{1}",
                              inputFile, "gsapi_init_with_args returned " + returnCode));
                      }
                  }
                  catch (Exception ex)
                  {
                      LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.错误:{1}", inputFile, ex.ToString()));
                  }
                  finally
                  {
                      // Required order: gsapi_exit first, then gsapi_delete_instance.
                      gsapi_exit(instance);
                      gsapi_delete_instance(instance);
                  }
              }
              finally
              {
                  // Free every handle that was actually allocated. The loop used to be bounded
                  // by the Ghostscript return code, which leaked handles or ran out of range.
                  for (int i = 0; i < argCount; i++)
                  {
                      if (pinnedArgs[i].IsAllocated)
                      {
                          pinnedArgs[i].Free();
                      }
                  }
                  if (pinnedArgv.IsAllocated)
                  {
                      pinnedArgv.Free();
                  }
              }
          }
          #endregion

          #region Argument generation
          /// <summary>
          /// Builds the Ghostscript argument vector for one conversion.
          /// Also stores <paramref name="deviceFormat"/>, <paramref name="width"/> and
          /// <paramref name="height"/> into the device-format and X/Y resolution fields.
          /// </summary>
          /// <param name="inputFile">Input PDF path; always the last argument.</param>
          /// <param name="outputFile">Output path, emitted as <c>-sOutputFile=</c>.</param>
          /// <param name="firstPage">Value for <c>-dFirstPage</c>.</param>
          /// <param name="lastPage">Value for <c>-dLastPage</c>.</param>
          /// <param name="deviceFormat">Value for <c>-sDEVICE</c>.</param>
          /// <param name="width">Stored as the X resolution (see <see cref="Convert"/>).</param>
          /// <param name="height">Stored as the Y resolution (see <see cref="Convert"/>).</param>
          /// <returns>Fixed switches, then optional switches, then output file and input file.</returns>
          private string[] GetGeneratedArgs(string inputFile, string outputFile,
              int firstPage, int lastPage, string deviceFormat, int width, int height)
          {
              this._sDeviceFormat = deviceFormat;
              this._iResolutionX = width;
              this._iResolutionY = height;

              List<string> args = new List<string>();

              // Fixed switches.
              args.Add("pdf2img");                 // argv[0]: program name, has little real use
              args.Add("-dNOPAUSE");               // do not pause after each page
              args.Add("-dBATCH");                 // exit after processing instead of going interactive
              args.Add("-dPARANOIDSAFER");
              args.Add("-sDEVICE=" + _sDeviceFormat);
              args.Add("-q");
              args.Add("-dQUIET");
              args.Add("-dNOPROMPT");
              args.Add("-dMaxBitmap=500000000");
              args.Add(String.Format("-dFirstPage={0}", firstPage));
              args.Add(String.Format("-dLastPage={0}", lastPage));
              args.Add("-dAlignToPixels=0");
              args.Add("-dGridFitTT=0");
              args.Add("-dTextAlphaBits=4");
              args.Add("-dGraphicsAlphaBits=4");

              // Optional switches.
              if (IsJpegDevice(_sDeviceFormat) && _iJPEGQuality > 0 && _iJPEGQuality < 101)
              {
                  args.Add("-dJPEGQ=" + _iJPEGQuality);
              }
              if (_iWidth > 0 && _iHeight > 0)
              {
                  args.Add("-g" + _iWidth + "x" + _iHeight);
              }
              if (_bFitPage)
              {
                  args.Add("-dPDFFitPage");
              }
              if (_iResolutionX > 0)
              {
                  if (_iResolutionY > 0)
                  {
                      args.Add("-r" + _iResolutionX + "x" + _iResolutionY);
                  }
                  else
                  {
                      args.Add("-r" + _iResolutionX);
                  }
              }

              // Output and input files come last.
              args.Add(string.Format("-sOutputFile={0}", outputFile));
              args.Add(string.Format("{0}", inputFile));
              return args.ToArray();
          }

          /// <summary>
          /// Tells whether a device name selects JPEG output. The legacy value "jpg" is still
          /// accepted; names starting with "jpeg" (e.g. "jpeg", "jpeggray") are the ones
          /// Ghostscript actually defines, so the quality switch now applies to them too.
          /// </summary>
          /// <param name="deviceFormat">Device name; may be null.</param>
          /// <returns>True for "jpg" or any name starting with "jpeg".</returns>
          private static bool IsJpegDevice(string deviceFormat)
          {
              if (deviceFormat == null)
              {
                  return false;
              }
              return deviceFormat == "jpg"
                  || deviceFormat.StartsWith("jpeg", StringComparison.Ordinal);
          }
          #endregion
      }
    330 }

     OCR,识别Image代码(AsPrise辅助类)

     1 using System;
     2 using System.Collections.Generic;
     3 using System.Runtime.InteropServices;
     4 using System.Text;
     5 
     6 namespace PDFCaptureService
     7 {
     /// <summary>
     /// P/Invoke bindings for AspriseOCR.dll plus a convenience method that runs OCR
     /// on an image file.
     /// </summary>
     public static class AspriseOCRHandler
     {
         #region Native imports
         /// <summary>Native <c>OCR</c> entry point; returns a pointer to the result text.</summary>
         [DllImport("AspriseOCR.dll", EntryPoint = "OCR", CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr OCR(string file, int type);

         /// <summary>Native <c>OCRpart</c> entry point, taking a start point and a size.</summary>
         [DllImport("AspriseOCR.dll", EntryPoint = "OCRpart", CallingConvention = CallingConvention.Cdecl)]
         static extern IntPtr OCRpart(string file, int type,
             int startX, int startY, int width, int height);

         /// <summary>Native <c>OCRBarCodes</c> entry point.</summary>
         [DllImport("AspriseOCR.dll", EntryPoint = "OCRBarCodes", CallingConvention = CallingConvention.Cdecl)]
         static extern IntPtr OCRBarCodes(string file, int type);

         /// <summary>Native <c>OCRpartBarCodes</c> entry point, taking a start point and a size.</summary>
         [DllImport("AspriseOCR.dll", EntryPoint = "OCRpartBarCodes", CallingConvention = CallingConvention.Cdecl)]
         static extern IntPtr OCRpartBarCodes(string file, int type,
             int startX, int startY, int width, int height);
         #endregion

         /// <summary>
         /// Runs OCR over a file and returns the recognised text.
         /// </summary>
         /// <param name="fileName">Path of the file handed to the native <c>OCR</c> function.</param>
         /// <returns>The text the native pointer refers to, read as an ANSI string.</returns>
         public static string ReadImage(string fileName)
         {
             // NOTE(review): type -1 presumably asks the library to auto-detect the image
             // format - confirm against the Asprise OCR documentation.
             return Marshal.PtrToStringAnsi(OCR(fileName, -1));
         }
     }
    36 }

     调用示例

    // Render the PDF to a temporary JPEG with Ghostscript, then OCR that image.
    GhostscriptHandler ghostscriptHandler = new GhostscriptHandler();
    string tempJpgFileName = string.Format(GhostScriptImageName, Guid.NewGuid().ToString());
    int pdfPageCount = ITextSharpHandler.GetPdfPageCount(fileName);
    // GetPdfPageCount returns -1 when the PDF is missing or cannot be opened;
    // do not hand that value to Ghostscript as the last page.
    if (pdfPageCount > 0)
    {
        // The last two arguments are used as the X/Y resolution by GhostscriptHandler.
        // NOTE(review): with more than one page, a single output file name presumably
        // ends up holding only one page unless it contains a %d placeholder - confirm.
        ghostscriptHandler.Convert(fileName, tempJpgFileName, 1, pdfPageCount, "jpeg", 100, 100);
        // OCR the generated image, not the source PDF.
        fileContent = AspriseOCRHandler.ReadImage(tempJpgFileName);
    }
  • 相关阅读:
    ES6和Node.js的import和export
    重写Router.prototype.push后还报NavigationDuplicated错误的解决方法
    nightwatch对前端做自动化测试
    使用video.js 7在html中播放rtmp视频流
    UEFI开发环境搭建
    类的静态成员
    const成员函数
    类和结构
    最长递增子序列
    C语言将十六进制字符串转化成十六进制
  • 原文地址:https://www.cnblogs.com/mahongbiao/p/3760867.html
Copyright © 2011-2022 走看看