zoukankan      html  css  js  c++  java
  • 读取Word文档中的表格

     /// <summary>
     /// Reads the tables of the Word documents in the default folder
     /// (<c>E:/20190917</c>) into a <see cref="DataTable"/>.
     /// </summary>
     /// <returns>
     /// The table produced by <see cref="Run(string)"/>, or <c>null</c> when an
     /// exception was raised while reading.
     /// </returns>
     // Needs: using System.Data; using System.IO; using Microsoft.Office.Interop.Word;
     public static DataTable Run()
     {
         return Run("E:/20190917");
     }

     /// <summary>
     /// Opens every <c>.doc</c>/<c>.docx</c> file directly inside
     /// <paramref name="directoryPath"/> with Word and adds one row per document
     /// that has at least two carriage-return separated segments of text.
     /// </summary>
     /// <param name="directoryPath">Folder whose files are scanned (not recursive).</param>
     /// <returns>
     /// A table with three string columns: <c>One</c> (the text before the first
     /// carriage return), <c>TwoText</c> and <c>TwoHtml</c> (elements 0 and 1 of the
     /// list returned by <c>GetContent</c>). Returns <c>null</c> if any exception is
     /// raised; the exception message is written through <c>WriteErrorInformation</c>.
     /// </returns>
     public static DataTable Run(string directoryPath)
     {
         try
         {
             // Result table filled from the Word documents.
             DataTable dt = new DataTable();
             dt.Columns.Add(new DataColumn("One", typeof(string)));
             dt.Columns.Add(new DataColumn("TwoText", typeof(string)));
             dt.Columns.Add(new DataColumn("TwoHtml", typeof(string)));

             int number = 0;        // documents that produced a row
             int troublecount = 0;  // documents skipped because they had no body text

             // A single Word instance is shared by all documents and is only started
             // once the first Word file is found.
             Application app = null;
             try
             {
                 DirectoryInfo dir = new DirectoryInfo(directoryPath);
                 foreach (FileInfo item in dir.GetFiles())
                 {
                     // Only Word documents are processed; every other file is ignored.
                     string extension = item.Extension;
                     bool isWordDocument =
                         string.Equals(extension, ".doc", StringComparison.OrdinalIgnoreCase) ||
                         string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase);
                     if (!isWordDocument)
                     {
                         continue;
                     }

                     // The interop Open call takes its arguments as ref object.
                     object fileName = item.FullName;
                     object confirmCovert = false;

                     if (app == null)
                     {
                         app = new Application();
                         app.Visible = false;
                     }

                     Document doc = app.Documents.Open(ref fileName, ref confirmCovert);
                     try
                     {
                         string content = doc.Content.Text;

                         // Segments of the document text are separated by '\r'.
                         string[] arr = content.Split('\r');
                         if (arr.Length < 2)
                         {
                             troublecount++;
                             Console.WriteLine("文件{0}中没有正文!!!!!!!!。{1}", fileName, troublecount);
                             continue; // the finally block below still closes the document
                         }

                         DataRow dr = dt.NewRow();
                         dr["One"] = arr[0];

                         // IndexOf yields -1 when the marker is absent; the value is handed
                         // to GetContent unchanged.
                         int contentIndex = content.IndexOf("表格显示:", StringComparison.Ordinal);
                         List<string> lst = GetContent(doc, content, contentIndex);

                         dr["TwoText"] = lst[0];
                         dr["TwoHtml"] = lst[1];
                         dt.Rows.Add(dr);

                         number++;
                     }
                     finally
                     {
                         // Always release the document, including on the skip path and on errors.
                         doc.Close();
                     }

                     // Reached only when a row was added for this document.
                     WriteOuputInformation(string.Format("{0}:文档已经存入数据库。{1}", fileName, number));
                 }

                 Console.WriteLine("所有文件已读取完毕,共读取了{0}条数据,没有数据的Word文档总条数为{1}", number, troublecount);
                 return dt;
             }
             finally
             {
                 // Shut Word down even when reading failed, so no hidden instance is left running.
                 if (app != null)
                 {
                     app.Quit();
                 }
             }
         }
         catch (Exception exp)
         {
             WriteErrorInformation(string.Format("Exception: {0}", exp.Message));
             return null;
         }
     }
    #region[获取表格纯文本内容和富文本内容]
            /// <summary>
            /// Produces a plain-text and an HTML version of the document text that
            /// follows the content marker. Each Word table is rendered as an HTML
            /// <c>&lt;table&gt;</c>, and every carriage return becomes a paragraph break.
            /// </summary>
            /// <param name="doc">Open Word document whose tables are read.</param>
            /// <param name="Content">Full text of <paramref name="doc"/>.</param>
            /// <param name="contentIndex">
            /// Index of the marker inside <paramref name="Content"/>. A negative value
            /// (marker not found) makes the whole text be used.
            /// </param>
            /// <returns>
            /// A two-element list: element 0 is the plain text, element 1 is the HTML.
            /// </returns>
            static List<string> GetContent(Document doc, string Content, int contentIndex)
            {
                // Number of characters skipped from the marker position.
                // NOTE(review): presumably the 5-character marker plus the carriage return
                // that follows it - confirm against the caller.
                const int markerSkipLength = 6;

                // Clamp the offset so a missing marker or a marker at the very end of the
                // text cannot produce a bogus or out-of-range Substring call.
                int start = contentIndex < 0
                    ? 0
                    : Math.Min(contentIndex + markerSkipLength, Content.Length);

                string contentText = Content.Substring(start);
                string contentHtml = contentText;

                // Word collections are 1-based. With no tables the loop body never runs and
                // only the paragraph wrapping below is applied.
                int tableCount = doc.Tables.Count;
                for (int i = 1; i <= tableCount; i++)
                {
                    var table = doc.Tables[i];

                    // Raw text of the table as it appears inside the document text; used
                    // below to locate the table and swap in its HTML rendering.
                    string wordtable = table.Range.Text;
                    int rowCount = table.Rows.Count;
                    int columnCount = table.Columns.Count;

                    var htmltable = new System.Text.StringBuilder();
                    htmltable.Append("<table cellspacing='0' bordercolor='black' border='1' cellpadding='5' text-align='center'>");
                    for (int row = 1; row <= rowCount; row++)
                    {
                        htmltable.Append("<tr>");
                        for (int column = 1; column <= columnCount; column++)
                        {
                            htmltable.Append("<td>");

                            // NOTE(review): the results of these helpers are not used - the
                            // cell is addressed by (row, column) directly. The calls are kept
                            // in case the helpers have side effects; confirm and remove, or
                            // use their results if merged cells need remapping.
                            getTableRowIndex(row, column, doc, i);
                            getTablecolumnIndex(row, column, doc, i);

                            // Strip the carriage return and the BEL character (the end-of-cell
                            // marker) from the cell text.
                            // NOTE(review): the text is inserted into the HTML without
                            // HTML-encoding; '<' or '&' in a document will break the markup.
                            string cellText = table.Cell(row, column).Range.Text
                                .Replace("\r", "")
                                .Replace("\a", "");
                            htmltable.Append(cellText);
                            htmltable.Append("</td>");

                            // NOTE(review): two spaces are appended to the plain text for
                            // every cell; purpose unclear, kept to preserve the output.
                            contentText += "  ";
                        }
                        htmltable.Append("</tr>");
                    }
                    htmltable.Append("</table>");

                    contentHtml = contentHtml.Replace(wordtable, htmltable.ToString());
                }

                // Wrap the text in paragraphs: every carriage return closes one <p> and
                // opens the next.
                contentHtml = "<p>" + contentHtml.Replace("\r", "</p><p>") + "</p>";

                List<string> lst = new List<string>();
                lst.Add(contentText);
                lst.Add(contentHtml);
                return lst;
            }
            #endregion
    #region[操作后给出提示信息]
            /// <summary>
            /// Writes an error line, prefixed with <c>Error: </c>, to the console in red.
            /// </summary>
            /// <param name="errorInformation">Text to print after the prefix.</param>
            static void WriteErrorInformation(string errorInformation)
            {
                // Remember the current colour so the caller's console settings are put
                // back afterwards instead of being forced to a fixed colour.
                ConsoleColor previousColor = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.Red;
                try
                {
                    Console.WriteLine("Error: " + errorInformation);
                }
                finally
                {
                    // Restored even if writing fails, so the console is not left red.
                    Console.ForegroundColor = previousColor;
                }
            }
            /// <summary>
            /// Writes a progress line, prefixed with <c>--&gt;&gt;</c>, to the console in dark green.
            /// </summary>
            /// <param name="outputInformation">Text to print after the prefix.</param>
            static void WriteOuputInformation(string outputInformation)
            {
                // Remember the current colour so the caller's console settings are put
                // back afterwards instead of being forced to a fixed colour.
                ConsoleColor previousColor = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.DarkGreen;
                try
                {
                    Console.WriteLine("-->>" + outputInformation);
                }
                finally
                {
                    // Restored even if writing fails, so the console is not left green.
                    Console.ForegroundColor = previousColor;
                }
            }
            #endregion
  • 相关阅读:
    17.CSS 文本属性和字体属性
    15.CSS 浮动
    D. Same GCDs
    B. Infinite Prefixes
    D. 0-1 MST
    lambda表达式复习
    D. New Year and Conference
    C. New Year and Permutation
    D. Dr. Evil Underscores
    D. Minimax Problem
  • 原文地址:https://www.cnblogs.com/suflowers1700218/p/11676645.html
Copyright © 2011-2022 走看看