iTextSharp 是一个开源组件,可用于读取 PDF 的文本内容。这里的要求是:能够逐行读取文本;同一行中间隔较大的文本块之间用空格分隔;并且在判断换行和间隔时允许一定的坐标误差。
/// <summary>
/// Text extraction strategy that groups the text runs of a page into lines.
/// A run whose baseline lies lower than the previous run's baseline by at least
/// the line tolerance starts a new line; a run on the same line that starts at
/// least the gap tolerance to the right of where the previous run ended is
/// separated from it by a single space.
/// </summary>
/// <remarks>
/// Completed lines are collected in <see cref="strings"/> and the Y coordinate
/// of each line's baseline in <see cref="baselines"/> (same index). The last
/// line is only committed when <see cref="GetResultantText"/> is called.
/// </remarks>
public class TextAsParagraphsExtractionStrategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
    //Tolerance (in PDF user-space units) used when none is supplied
    private const float DefaultTolerance = 2f;

    //Minimum downward baseline shift that is treated as a new line
    private readonly float lineTolerance;

    //Minimum horizontal gap between two runs that is rendered as a space
    private readonly float gapTolerance;

    //Text buffer for the line currently being assembled
    private readonly StringBuilder result = new StringBuilder();

    //Store last used properties
    private Vector lastStartPoint;
    private Vector lastEndPoint;

    //Buffer of lines of text and their Y coordinates. NOTE, these should be exposed as
    //properties instead of fields but are left as is so existing callers keep working
    public List<string> strings = new List<String>();
    public List<float> baselines = new List<float>();

    /// <summary>
    /// Creates a strategy that uses a tolerance of 2 units for both the line
    /// break test and the horizontal gap test.
    /// </summary>
    public TextAsParagraphsExtractionStrategy()
        : this(DefaultTolerance, DefaultTolerance)
    {
    }

    /// <summary>
    /// Creates a strategy with explicit tolerances.
    /// </summary>
    /// <param name="lineTolerance">Minimum drop of the baseline Y coordinate that starts a new line.</param>
    /// <param name="gapTolerance">Minimum horizontal distance between two runs that inserts a space.</param>
    /// <exception cref="ArgumentOutOfRangeException">A tolerance is negative.</exception>
    public TextAsParagraphsExtractionStrategy(float lineTolerance, float gapTolerance)
    {
        if (lineTolerance < 0)
        {
            throw new ArgumentOutOfRangeException("lineTolerance");
        }
        if (gapTolerance < 0)
        {
            throw new ArgumentOutOfRangeException("gapTolerance");
        }

        this.lineTolerance = lineTolerance;
        this.gapTolerance = gapTolerance;
    }

    /// <summary>
    /// Called whenever a run of text is encountered. Appends the run to the
    /// current line, first committing the previous line or inserting a space
    /// when the run's position calls for it.
    /// </summary>
    /// <param name="renderInfo">Position and content of the text run.</param>
    public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
    {
        //Fetch the baseline once; both end points are read from the same segment
        LineSegment baseline = renderInfo.GetBaseline();
        Vector curStartPoint = baseline.GetStartPoint();
        Vector curEndPoint = baseline.GetEndPoint();

        //This code assumes that if the baseline moves down then we're on a new line
        bool isNewLine = (this.lastStartPoint != null)
            && (curStartPoint[Vector.I2] <= this.lastStartPoint[Vector.I2] - this.lineTolerance);

        if (isNewLine)
        {
            //Mark the previous line as done and reset the line buffer
            this.FlushLine();
        }
        else if ((this.lastEndPoint != null)
            && (curStartPoint[Vector.I1] >= this.lastEndPoint[Vector.I1] + this.gapTolerance))
        {
            //Same line, but far enough from the previous run to count as a separate block
            this.result.Append(" ");
        }

        //Append the current text to our line buffer
        this.result.Append(renderInfo.GetText());

        //Remember where this run was for the next comparison
        this.lastStartPoint = curStartPoint;
        this.lastEndPoint = curEndPoint;
    }

    /// <summary>
    /// Commits any pending line and returns all collected lines joined with
    /// <see cref="Environment.NewLine"/>. Callers that need the baseline of each
    /// line should read <see cref="strings"/> and <see cref="baselines"/> instead.
    /// </summary>
    /// <returns>The collected lines, or an empty string when no text was collected.</returns>
    public string GetResultantText()
    {
        //One last time, see if there's anything left in the buffer. The buffer is
        //cleared afterwards so that calling this method again does not add the
        //trailing line a second time.
        this.FlushLine();

        return string.Join(Environment.NewLine, this.strings);
    }

    //Not needed, part of interface contract
    public void BeginTextBlock() { }

    public void EndTextBlock() { }

    public void RenderImage(ImageRenderInfo renderInfo) { }

    //Moves the buffered line into strings/baselines (unless it is only whitespace) and empties the buffer
    private void FlushLine()
    {
        string pending = this.result.ToString();

        //See if we have text and not just whitespace
        if (!String.IsNullOrWhiteSpace(pending))
        {
            this.baselines.Add(this.lastStartPoint[Vector.I2]);
            this.strings.Add(pending);
        }

        this.result.Clear();
    }
}
调用示例:读取 PDF 第 1 页的文本内容
//Usage sample: extract the text lines of page 1 and format each one with its baseline Y coordinate
PdfReader reader = new PdfReader(@"d:\20212.pdf");
TextAsParagraphsExtractionStrategy S = new TextAsParagraphsExtractionStrategy();
StringBuilder sb = new StringBuilder();
try
{
    //The return value is ignored on purpose; the lines are read from the strategy's buffers
    iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);

    for (int i = 0; i < S.strings.Count; i++)
    {
        sb.AppendLine(string.Format("Line {0,-5}: {1}", S.baselines[i], S.strings[i]));
    }
}
finally
{
    //Close the reader even when extraction throws, so the file is not left open
    reader.Close();
}

var sss = sb.ToString();