zoukankan      html  css  js  c++  java
  • c#抽取pdf文档标题(2)

     /// <summary>
     /// Extracts the title of a PDF document from its text and per-character layout data.
     /// </summary>
     public class IETitle
     {
         /// <summary>
         /// Character records for the document currently being processed. The list is cleared
         /// at the start of every <see cref="GetTitle"/> call and is filled by
         /// PrintTextLocatins2.processTextPosition while PDFBoxLib.HandleContent runs.
         /// NOTE(review): this is shared static state, so concurrent GetTitle calls would
         /// presumably interleave their data - confirm callers are single-threaded.
         /// </summary>
         public static List<WordInfo> WordsInfo = new List<WordInfo>();

         // Text returned by the first extraction pass; only written inside this class.
         private static string pdfcontent;

         /// <summary>
         /// Runs the title-extraction pipeline for one PDF file.
         /// </summary>
         /// <param name="path">Path of the PDF file.</param>
         /// <param name="realtitle">Passed through unchanged to InfoExtract.ExtractTitle.</param>
         /// <returns>
         /// The result of InfoExtract.ExtractTitle, or a HandleResult whose Title is empty when
         /// that step throws. Failures of the text extractors and of the title step are logged
         /// instead of being thrown.
         /// </returns>
         public static HandleResult GetTitle(string path, string realtitle)
         {
             WordsInfo.Clear();

             // First pass: try iTextSharp, fall back to PDFBox. If both fail the content
             // stays empty and the pipeline continues on a best-effort basis.
             string content = string.Empty;
             try
             {
                 content = ITextSharpLib.ExtractTextFromPdf(path);
             }
             catch (Exception primaryError)
             {
                 Logger.Debug(primaryError.Message);

                 try
                 {
                     content = PDFBoxLib.Pdf2txt(path);
                 }
                 catch (Exception fallbackError)
                 {
                     Logger.Debug(fallbackError.Message);
                 }
             }

             pdfcontent = content;

             // Referred to in the accompanying text as line 29 of the original numbered
             // listing: has PDFBox walk the page content, which populates WordsInfo.
             PDFBoxLib.HandleContent(path);

             // Process the characters.
             Word w = new Word();
             w.MakeWord(WordsInfo);

             Line line = new Line();
             line.MakeLine(w);

             // Process the lines.
             Block block = new Block();
             block.MakeBlock(line);

             // Get the full text. Presumably the 0 argument selects all pages - TODO confirm.
             // When that fails, reuse the text from the first pass.
             string text;
             try
             {
                 text = ITextSharpLib.ExtractTextFromPdf(path, 0);
             }
             catch (Exception fullTextError)
             {
                 Logger.Debug(fullTextError.Message);
                 text = content;
             }

             HandleResult title = new HandleResult() { Title = "" };

             try
             {
                 // One entry per line of extracted text.
                 var sentences = text.Split('\n');

                 InfoExtract ie = new InfoExtract(sentences, text);

                 title = ie.ExtractTitle(block, realtitle);
             }
             catch (Exception ex)
             {
                 Logger.Debug(ex.Message);
             }

             return title;
         }
     }

    上面就是获取标题的整体逻辑代码。其中第 29 行的 PDFBoxLib.HandleContent(path) 调用 PDFBoxLib 读取 PDF 第一页的内容,其实现如下:

     /// <summary>
     /// Loads a PDF with PDFBox and feeds the content stream of its first page (or of every
     /// page) through a PrintTextLocatins2 instance.
     /// </summary>
     /// <param name="fileName">Path of the PDF file to load.</param>
     /// <param name="pageIndex">
     /// Acts as a flag rather than an index: 0 processes every page, any other value
     /// (including the default 1) processes only the first page.
     /// </param>
     /// <returns>
     /// Always an empty string. Errors while loading, processing or closing the document
     /// are logged and never propagate to the caller.
     /// </returns>
     public static string HandleContent(string fileName, int pageIndex = 1)
     {
         PDDocument document = null;
         try
         {
             document = PDDocument.load(fileName);
             var allPages = document.getDocumentCatalog().getAllPages();

             // Never index past the end: a document without pages processes nothing.
             int pageCount = allPages.size();
             int size = pageIndex == 0 ? pageCount : Math.Min(1, pageCount);

             for (int i = 0; i < size; i++)
             {
                 var page = (PDPage)allPages.get(i);

                 var contents = page.getContents();
                 if (contents == null)
                 {
                     // Page has no content stream, so there is nothing to process.
                     continue;
                 }

                 // Referred to in the accompanying text as line 23 of the original numbered
                 // listing: processStream triggers PrintTextLocatins2.processTextPosition.
                 PrintTextLocatins2 printer = new PrintTextLocatins2();
                 printer.processStream(page, page.findResources(), contents.getStream());
             }
         }
         catch (Exception processingError)
         {
             Logger.Debug(processingError.Message);
         }
         finally
         {
             if (document != null)
             {
                 // Closing can fail too; keep the method's never-throws behaviour.
                 try
                 {
                     document.close();
                 }
                 catch (Exception closeError)
                 {
                     Logger.Debug(closeError.Message);
                 }
             }
         }

         return "";
     }

    第 23 行的 printer.processStream 方法会触发自定义类 PrintTextLocatins2 中的字符处理方法 processTextPosition:

     /// <summary>
     /// PDFTextStripper subclass that records every text position it processes as a
     /// WordInfo appended to IETitle.WordsInfo.
     /// </summary>
     public class PrintTextLocatins2 : PDFTextStripper
     {
         // Base-font name fragments that mark a font as bold or italic.
         private static readonly string[] BoldFlags = { "Bold", "CAJ FNT04" };
         private static readonly string[] ItalicFlags = { "Italic", "CAJ FNT03" };

         /// <summary>Returns true when the base-font name contains a bold marker.</summary>
         private static bool IsBold(String font)
         {
             return ContainsAny(font, BoldFlags);
         }

         /// <summary>Returns true when the base-font name contains an italic marker.</summary>
         private static bool IsItalic(String font)
         {
             return ContainsAny(font, ItalicFlags);
         }

         /// <summary>
         /// Returns true when <paramref name="font"/> contains at least one of
         /// <paramref name="flags"/>. A null or empty name matches nothing, so a font
         /// without a base-font name does not abort processing of the page.
         /// </summary>
         private static bool ContainsAny(string font, string[] flags)
         {
             if (string.IsNullOrEmpty(font))
             {
                 return false;
             }

             foreach (string flag in flags)
             {
                 if (font.Contains(flag))
                 {
                     return true;
                 }
             }

             return false;
         }

         /// <summary>
         /// Creates the stripper with position sorting switched off.
         /// </summary>
         public PrintTextLocatins2()
         {
             base.setSortByPosition(false);
         }

         /// <summary>
         /// Copies the geometry and font data of one text position into a WordInfo and
         /// appends it to IETitle.WordsInfo.
         /// </summary>
         /// <param name="text">The text position handed over by the base stripper.</param>
         protected override void processTextPosition(TextPosition text)
         {
             // Read the values that are used more than once a single time.
             var font = text.getFont();
             var baseFont = font.getBaseFont();
             var fontSize = text.getFontSize();
             var xScale = text.getXScale();
             var yScale = text.getYScale();

             WordInfo info = new WordInfo()
             {
                 X = text.getX(),
                 Y = text.getY(),
                 XDirAdj = text.getXDirAdj(),
                 YDirAdj = text.getYDirAdj(),
                 FontSize = fontSize,
                 Xscale = xScale,
                 Yscale = yScale,
                 Height = text.getHeight(),
                 Space = text.getWidthOfSpace(),
                 Width = text.getWidth(),

                 Subfont = font.getSubType(),
                 Basefont = baseFont,
                 IsBold = IsBold(baseFont),
                 IsItalic = IsItalic(baseFont),

                 // Font size multiplied by the scale, truncated to whole units.
                 XSize = (int)(fontSize * xScale),
                 YSize = (int)(fontSize * yScale),

                 Word = text.getCharacter()
             };

             // The width of a space can come back as NaN; store 0 in that case. Checking
             // the value itself works under every culture, unlike comparing the
             // localized text that ToString() produces for NaN.
             if (double.IsNaN(info.Space))
             {
                 info.Space = 0;
             }

             IETitle.WordsInfo.Add(info);
         }
     }

    这样,我们就利用 PDFBox 收集到了 PDF 文档中各个字符的信息。

  • 相关阅读:
    Android 中常用代码片段
    查看oracle中的中文所占字节数
    order by 中 使用decode
    Oracle select 中case 的使用以及使用decode替换case
    pyqt5 'QWidget' object has no attribute 'setCentralWidget'(转)
    程序员之路:python3+PyQt5+pycharm桌面GUI开发(转)
    QT5入门之23 -QT串口编程(转)
    xpath-helper: 谷歌浏览器安装xpath helper 插件
    mysql给root开启远程访问权限
    Vmware无法获取快照信息 锁定文件失败
  • 原文地址:https://www.cnblogs.com/wangqiang3311/p/7743282.html
Copyright © 2011-2022 走看看