zoukankan      html  css  js  c++  java
  • itextsharp读取pdf文档内容

    iTextSharp 是一个开源组件,可以用于读取 PDF 的文本内容。这里的需求是:可以逐行读取文本,用空格区分间隔较大的文本块,并且对坐标的微小偏差有一定的误差兼容。

     /// <summary>
     /// Text extraction strategy that groups the text runs reported by the PDF parser into lines.
     /// A new line is started when the baseline of a run drops below the previous run's baseline
     /// by at least <see cref="LineBreakTolerance"/>; a single space is inserted between runs on
     /// the same line when they are separated by at least <see cref="WordGapTolerance"/>.
     /// After extraction, callers read the collected lines from <see cref="strings"/> and their
     /// Y coordinates from <see cref="baselines"/> (both lists are index-aligned).
     /// </summary>
     public class TextAsParagraphsExtractionStrategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
        {
            //How far the baseline's Y coordinate must drop before the run counts as a new line.
            //Smaller drops are treated as jitter on the same line.
            //NOTE: only downward movement is treated as a line break; a run that moves upwards stays on the current line.
            private const float LineBreakTolerance = 2f;

            //How far to the right of the previous run's end a run must start before a space is inserted.
            private const float WordGapTolerance = 2f;

            //Text buffer for the line currently being assembled
            private StringBuilder result = new StringBuilder();

            //Start and end points of the baseline of the most recently rendered run
            private Vector lastStartPoint;
            private Vector lastEndPoint;

            //Buffer of lines of text and their Y coordinates. NOTE, these should be exposed as properties instead of fields but are left as is for simplicity's sake
            public List<string> strings = new List<String>();
            public List<float> baselines = new List<float>();

            /// <summary>
            /// Called by the parser whenever a run of text is encountered. Decides whether the run
            /// starts a new line, continues the current line after a gap, or directly continues it,
            /// then appends the run's text to the line buffer.
            /// </summary>
            /// <param name="renderInfo">Render information for the current run of text.</param>
            public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
            {
                //Fetch the baseline once; both end points come from the same segment
                LineSegment baseline = renderInfo.GetBaseline();
                Vector curStartPoint = baseline.GetStartPoint();
                Vector curEndPoint = baseline.GetEndPoint();

                //This code assumes that if the baseline has dropped far enough then we're on a new line
                if ((this.lastStartPoint != null) && (curStartPoint[Vector.I2] <= this.lastStartPoint[Vector.I2] - LineBreakTolerance))
                {
                    //Mark the previous line as done and reset the line buffer
                    this.FlushLine();
                }
                else if ((this.lastEndPoint != null) && (curStartPoint[Vector.I1] >= this.lastEndPoint[Vector.I1] + WordGapTolerance))
                {
                    //Same line, but far enough from the previous run to count as a separate block of text
                    this.result.Append(" ");
                }

                //Append the current text to our line buffer
                this.result.Append(renderInfo.GetText());

                //Remember where this run was so the next run can be compared against it
                this.lastStartPoint = curStartPoint;
                this.lastEndPoint = curEndPoint;
            }

            /// <summary>
            /// Flushes whatever is left in the line buffer into <see cref="strings"/> and
            /// <see cref="baselines"/>. The internal state is reset afterwards, so calling this
            /// method again (or reusing the instance for another page) does not record the last
            /// line a second time or merge it into the following text.
            /// </summary>
            /// <returns>
            /// Always <c>null</c>; callers should inspect <see cref="strings"/> and
            /// <see cref="baselines"/> instead of the return value.
            /// </returns>
            public string GetResultantText()
            {
                //One last time, see if there's anything left in the buffer
                this.FlushLine();

                //Forget the previous position so the next run (if any) is not compared against stale coordinates
                this.lastStartPoint = null;
                this.lastEndPoint = null;

                //We're not going to use this method to return a string
                return null;
            }

            //Not needed, part of interface contract
            public void BeginTextBlock() { }
            public void EndTextBlock() { }
            public void RenderImage(ImageRenderInfo renderInfo) { }

            //Stores the buffered line (if it holds more than whitespace) together with the Y coordinate
            //of the last run seen on it, then empties the buffer.
            private void FlushLine()
            {
                string lineText = this.result.ToString();

                //A non-blank buffer implies RenderText has run at least once, so lastStartPoint is set
                if (!String.IsNullOrWhiteSpace(lineText))
                {
                    this.baselines.Add(this.lastStartPoint[Vector.I2]);
                    this.strings.Add(lineText);
                }

                this.result.Clear();
            }
        }

    调用示例:读取 PDF 第 1 页的文本内容

                //Usage example: extract the lines of page 1 and format each one with its baseline Y coordinate
                PdfReader reader = new PdfReader(@"d:\20212.pdf");
                TextAsParagraphsExtractionStrategy S = new TextAsParagraphsExtractionStrategy();
                StringBuilder sb = new StringBuilder();
                try
                {
                    //The return value is ignored on purpose: the strategy exposes its results through S.strings / S.baselines
                    iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
                    for (int i = 0; i < S.strings.Count; i++)
                    {
                        sb.AppendLine(string.Format("Line {0,-5}: {1}", S.baselines[i], S.strings[i]));
                    }
                }
                finally
                {
                    //Always release the reader, even if extraction throws
                    reader.Close();
                }
                var sss = sb.ToString();
  • 相关阅读:
    Oracle PL/SQL中如何使用%TYPE和%ROWTYPE
    SVN使用教程总结
    实践SQLServer Tuning
    SQL性能优化:如何定位网络性能问题
    windows7下修改hosts文件无效解决办法
    jQuery Event.stopPropagation() 函数详解
    引用js或css后加?v= 版本号的用法
    JS实现点击跳转登陆邮箱
    DWZ (JUI) 教程 navTab 刷新分析
    CPU与内存的关系
  • 原文地址:https://www.cnblogs.com/njcxwz/p/15637178.html
Copyright © 2011-2022 走看看