zoukankan      html  css  js  c++  java
  • 从pdf中读取表格数据并且写入datatable中

    首先引入第三方组件 Tabula。这是一个开源组件,基于 PdfPig 组件实现。

            /// <summary>
            /// Extracts every table detected on the requested pages of a PDF file and
            /// converts each one into a <see cref="DataTable"/> whose cells hold the table text.
            /// </summary>
            /// <param name="pdfPath">Path of the PDF file to open.</param>
            /// <param name="startNumber">First page to scan (1-based, inclusive).</param>
            /// <param name="endNumber">
            /// Last page to scan (inclusive). Values beyond the last page are clamped to the
            /// document's page count.
            /// </param>
            /// <returns>
            /// One <see cref="DataTable"/> per detected table, in page order. Every column is of
            /// type string and is named after its zero-based index ("0", "1", ...). The list is
            /// empty when no table is found or when <paramref name="startNumber"/> is greater
            /// than <paramref name="endNumber"/>.
            /// </returns>
            /// <exception cref="IndexOutOfRangeException">
            /// <paramref name="startNumber"/> is less than 1 or greater than the number of pages.
            /// </exception>
            private List<DataTable> ExtractTables(string pdfPath, int startNumber, int endNumber)
            {
                // No try/catch wrapper: the previous "catch { throw ex; }" only reset the stack
                // trace. Exceptions now propagate to the caller with their original trace.
                using (UglyToad.PdfPig.PdfDocument document = UglyToad.PdfPig.PdfDocument.Open(pdfPath, new ParsingOptions() { ClipPaths = true }))
                {
                    int pageCount = document.NumberOfPages;

                    // The start page must be a real page of the document.
                    if (startNumber < 1 || startNumber > pageCount)
                    {
                        throw new IndexOutOfRangeException("页码超出范围!");
                    }

                    // An end page past the last page is clamped for every valid start page,
                    // including the case where the start page is the last page.
                    if (endNumber > pageCount)
                    {
                        endNumber = pageCount;
                    }

                    ObjectExtractor oe = new ObjectExtractor(document);
                    IExtractionAlgorithm ea = new SpreadsheetExtractionAlgorithm();
                    List<DataTable> dtList = new List<DataTable>();

                    for (int i = startNumber; i <= endNumber; i++)
                    {
                        PageArea page = oe.Extract(i);
                        List<Table> tables = ea.Extract(page);

                        foreach (Table tb in tables)
                        {
                            DataTable dt = new DataTable();
                            int columnCount = tb.ColumnCount;

                            for (int b = 0; b < columnCount; b++)
                            {
                                dt.Columns.Add(b.ToString(), typeof(string));
                            }

                            foreach (IReadOnlyList<Cell> row in tb.Rows)
                            {
                                DataRow dr = dt.NewRow();
                                for (int c = 0; c < columnCount; c++)
                                {
                                    // Store the cell's text. Assigning the Cell object itself
                                    // would store Cell.ToString() instead of the content.
                                    // A row shorter than the column count is padded with
                                    // empty strings.
                                    dr[c] = c < row.Count ? row[c].GetText() : string.Empty;
                                }
                                dt.Rows.Add(dr);
                            }

                            dtList.Add(dt);
                        }
                    }

                    return dtList;
                }
            }
  • 相关阅读:
    4种定位的区别
    tab切换插件
    CPU的cache知识
    linux free命令详解
    关于登录linux时,/etc/profile、~/.bash_profile等几个文件的执行过程
    职业规范(运维)
    数据库的横表和纵表
    Linux下的xargs的用法
    LINUX ulimit命令
    防火墙并发连接数
  • 原文地址:https://www.cnblogs.com/njcxwz/p/15637239.html
Copyright © 2011-2022 走看看