zoukankan      html  css  js  c++  java
  • 将网页上word、pdf、txt文件下载下来,解析成文本内容

      1.第一步解析网页

      首先创建个HttpWebRequest,

    创建HttpWebRequest
     /// <summary>
     /// Creates an <see cref="HttpWebRequest"/> for the given URL, preset with a
     /// Firefox user agent, wildcard Accept/Content-Type values, a fixed referer
     /// and a fresh cookie container. The request is only built, not sent.
     /// </summary>
     /// <param name="url">Address to request; must use a scheme that yields an HttpWebRequest.</param>
     /// <param name="cookies">Cookies carrying the authenticated user's information; may be null.</param>
     /// <returns>The configured request.</returns>
     /// <exception cref="ArgumentException">
     /// <paramref name="url"/> does not resolve to an HTTP(S) request.
     /// </exception>
     public static HttpWebRequest CreateHttpWebRequest(string url, CookieCollection cookies)
     {
         HttpWebRequest webRequest = WebRequest.Create(url) as HttpWebRequest;
         if (webRequest == null)
         {
             // WebRequest.Create returns other request types for schemes such as ftp:// or file://;
             // fail with a clear message instead of a NullReferenceException on the next line.
             throw new ArgumentException("The URL does not produce an HTTP request: " + url, "url");
         }

         webRequest.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10";
         webRequest.Accept = "*/*";
         webRequest.ContentType = "*/*";
         webRequest.Referer = "http://www.java.com.cn";

         // Always attach a container so cookies set by the server are captured on the response.
         webRequest.CookieContainer = new CookieContainer();
         if (cookies != null)
         {
             webRequest.CookieContainer.Add(cookies);
         }

         return webRequest;
     }

      再通过GetResponse得到它的HttpWebResponse:

    // Send the request synchronously and obtain the HTTP response.
    // NOTE(review): `req` is presumably the request built by CreateHttpWebRequest — confirm.
    // HttpWebResponse is IDisposable; close it once the body has been read.
    HttpWebResponse res = (HttpWebResponse)req.GetResponse();
    

     2.判断网址内容是不是附件的形式:HttpWebResponse有个ContentType属性,如果它的值是application/x-msdownload,则说明网址内容是附件。然后根据响应的Headers就可以得到附件的格式(word、pdf或txt)

    网页内容格式
    // A response served as application/x-msdownload is treated as an attachment.
    if (res.ContentType == "application/x-msdownload")
    {
        // NOTE(review): this reads the FIRST response header and assumes its value ends with
        // the file name (e.g. a Content-Disposition header) — confirm against the target server.
        string header = string.Empty;
        if (res.Headers.Count > 0)
        {
            string[] values = res.Headers.GetValues(0);
            if (values != null && values.Length > 0)
            {
                header = values[0];
            }
        }

        // Attachment format = last three characters of the header value (doc, pdf, txt);
        // guard the length so a short value cannot make Substring throw.
        string type = header.Length >= 3
            ? header.Substring(header.Length - 3, 3)
            : header;
    }

     3.然后下载附件

    下载文件
     /// <summary>
     /// Downloads the resource at <paramref name="strHref"/> with an HTTP GET and
     /// saves it to <paramref name="filePath"/>.
     /// </summary>
     /// <param name="strHref">URL of the file to download.</param>
     /// <param name="filePath">Path the downloaded bytes are written to.</param>
     /// <returns>
     /// true when the body was fully read and written to disk; false when any step threw.
     /// </returns>
     public bool DownLoadFile(string strHref, string filePath)
     {
         HttpWebRequest hreq = null;
         try
         {
             hreq = (HttpWebRequest)HttpWebRequest.Create(strHref);

             // 10-second timeout.
             hreq.Timeout = 10 * 1000;
             hreq.Method = "GET";

             byte[] bufferWrite;

             // Dispose the response and both streams even when reading fails,
             // so the underlying connection is released.
             using (HttpWebResponse hres = (HttpWebResponse)hreq.GetResponse())
             using (Stream rs = hres.GetResponseStream())
             using (MemoryStream memoryStream = new MemoryStream())
             {
                 byte[] buffer = new byte[0x100];
                 int read;
                 while ((read = rs.Read(buffer, 0, buffer.Length)) > 0)
                 {
                     memoryStream.Write(buffer, 0, read);
                 }

                 bufferWrite = memoryStream.ToArray();
             }

             // The file is written only after the whole body has been received,
             // so an interrupted download does not leave a truncated file behind.
             File.WriteAllBytes(filePath, bufferWrite);
             return true;
         }
         catch (Exception)
         {
             // Best-effort contract: callers only learn success or failure.
             return false;
         }
         finally
         {
             if (hreq != null)
             {
                 hreq.Abort();
             }
         }
     }

    4.解析下载下来的附件

      解析word:首先电脑上必须安装了Office Word;然后在项目中添加对Microsoft.Office.Interop.Word的引用

    解析word文件转换成txt文本
     /// <summary>
     /// Extracts the complete text of a Word document by automating Microsoft Word
     /// through Microsoft.Office.Interop.Word.
     /// </summary>
     /// <param name="fileName">Path of the Word file to read.</param>
     /// <returns>The document's text, or an empty string when any step fails.</returns>
     private string GetTextFWord(string fileName)
     {
         string txtContent = string.Empty;
         try
         {
             Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
             try
             {
                 object file = fileName;
                 object nullobj = System.Reflection.Missing.Value;
                 Microsoft.Office.Interop.Word.Document doc = wordApp.Documents.Open(
                     ref file, ref nullobj, ref nullobj,
                     ref nullobj, ref nullobj, ref nullobj,
                     ref nullobj, ref nullobj, ref nullobj,
                     ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                 try
                 {
                     // Read all of the document's text.
                     txtContent = doc.Content.Text;
                 }
                 finally
                 {
                     // Close the document even when reading its text failed.
                     doc.Close(ref nullobj, ref nullobj, ref nullobj);
                 }
             }
             finally
             {
                 // Shut Word down; otherwise every call leaves a WINWORD.EXE process running.
                 // Cast to _Application so the Quit method is not confused with the Quit event.
                 object missing = System.Reflection.Missing.Value;
                 ((Microsoft.Office.Interop.Word._Application)wordApp).Quit(ref missing, ref missing, ref missing);
             }
         }
         catch
         {
             // Any failure (Word missing, unreadable file, cleanup error) yields an empty result.
             return string.Empty;
         }
         return txtContent;
     }

      

      解析pdf  ,用到的是第三方插件PDFBox-0.7.3.dll,大家可以去网上搜索

    pdf文件转换成文本内容
     /// <summary>
     /// Extracts the text of a PDF file using PDFBox's PDFTextStripper.
     /// </summary>
     /// <param name="fileName">Path of the PDF file to read.</param>
     /// <returns>The extracted text, or an empty string when any step fails.</returns>
     private string GetTextFPDF(string fileName)
     {
         string txtContent = string.Empty;
         try
         {
             PDDocument doc = PDDocument.load(fileName);
             try
             {
                 PDFTextStripper pdfStripper = new PDFTextStripper();
                 // Set the line separator used in the extracted text.
                 pdfStripper.setLineSeparator(Environment.NewLine);
                 txtContent = pdfStripper.getText(doc);
             }
             finally
             {
                 // Close the document even when text extraction throws.
                 doc.close();
             }
         }
         catch
         {
             // Any failure (unreadable or malformed file) yields an empty result.
             return string.Empty;
         }
         return txtContent;
     }

      解析txt

    txt解析成文本内容
  • 相关阅读:
    poj_1236 强连通分支
    【winform程序】自定义webrowser控件调用IE的版本
    【小程序开发】微信小程序开发中遇到的那些坑...
    【C#多线程】C#多线程 Thread 开发基础
    【管理心得】不懂带人,你就自己干到死
    【80端口占用】win7下80端口被(Pid=4)占用的解决方法
    【顽固BUG】Visual Studio 2015 + TestDriven.NET-3.8.2860_Personal_Beta 调用的目标发生了异常。
    【HPP开发】让所有中小企业拥有自己的APP
    【创业积累】如何快速开发出一个高质量的APP
    【架构师之路】依赖注入原理---IoC框架
  • 原文地址:https://www.cnblogs.com/lian9527/p/2313041.html
Copyright © 2011-2022 走看看