zoukankan      html  css  js  c++  java
  • 将网页上word、pdf、txt文件下载下来,解析成文本内容

      1.第一步解析网页

      首先创建个HttpWebRequest,

    创建HttpWebRequest
     /// <summary>
     /// Creates an <see cref="HttpWebRequest"/> for <paramref name="url"/>, preconfigured with a
     /// browser-like User-Agent, wildcard Accept/Content-Type headers, a fixed Referer and a
     /// fresh cookie container. The request is only built here; it is not sent.
     /// </summary>
     /// <param name="url">Absolute http/https URL to request.</param>
     /// <param name="cookies">Cookies carrying the authenticated user's information; may be null.</param>
     /// <returns>The configured request.</returns>
     /// <exception cref="ArgumentException"><paramref name="url"/> does not use the http/https scheme.</exception>
     public static HttpWebRequest CreateHttpWebRequest(string url, CookieCollection cookies)
     {
         // WebRequest.Create returns a non-HTTP request type for schemes such as file:// or ftp://,
         // which "as" turns into null. Fail here with a clear message instead of letting the
         // property assignments below throw a NullReferenceException.
         HttpWebRequest webRequest = WebRequest.Create(url) as HttpWebRequest;
         if (webRequest == null)
         {
             throw new ArgumentException("Only http/https URLs are supported: " + url, "url");
         }

         webRequest.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10";
         webRequest.Accept = "*/*";
         webRequest.ContentType = "*/*";
         webRequest.Referer = "http://www.java.com.cn";

         // Always attach a container so cookies set by the server are captured on the response.
         webRequest.CookieContainer = new CookieContainer();
         if (cookies != null)
         {
             webRequest.CookieContainer.Add(cookies);
         }

         return webRequest;
     }

      然后发送请求,得到它的响应(HttpWebResponse):

    // Send the request and obtain the server's response.
    // NOTE(review): `req` is presumably the request returned by CreateHttpWebRequest — confirm against the caller.
    HttpWebResponse res = (HttpWebResponse)req.GetResponse();
    

     2.判断网址内容是否为附件。HttpWebResponse 有一个 ContentType 属性,如果它的值是 application/x-msdownload,则说明网址内容是以附件形式返回的。然后根据响应头(Headers)就可以得到附件的格式(word、pdf 或 txt)。

    网页内容格式
    // The response is an attachment when it is served as application/x-msdownload.
    // ContentType may carry extra parameters (e.g. "; charset=..."), so match by prefix, ignoring case.
    if (res.ContentType != null &&
        res.ContentType.StartsWith("application/x-msdownload", StringComparison.OrdinalIgnoreCase))
    {
        // Look the header up by name: the position of a header in the collection depends on the server.
        string header = res.Headers["Content-Disposition"];
        if (string.IsNullOrEmpty(header) && res.Headers.Count > 0)
        {
            // Fall back to the first header, which is what this code read originally.
            string[] firstHeaderValues = res.Headers.GetValues(0);
            header = (firstHeaderValues != null && firstHeaderValues.Length > 0) ? firstHeaderValues[0] : null;
        }

        // Strip trailing quotes/semicolons such as in: attachment; filename="report.doc"
        string fileNamePart = (header ?? string.Empty).Trim().TrimEnd('"', '\'', ';');

        // Attachment format = text after the last dot ("doc", "pdf", "txt", "docx", ...);
        // empty when the header carries no extension.
        int dotIndex = fileNamePart.LastIndexOf('.');
        string type = dotIndex >= 0 ? fileNamePart.Substring(dotIndex + 1) : string.Empty;
    }

     3.然后下载附件

    下载文件
     /// <summary>
     /// Downloads the resource at <paramref name="strHref"/> with an HTTP GET and writes it to
     /// <paramref name="filePath"/>.
     /// </summary>
     /// <param name="strHref">URL of the file to download.</param>
     /// <param name="filePath">Local path the downloaded bytes are written to.</param>
     /// <returns>true when the file was downloaded and written; false when any step failed.</returns>
     public bool DownLoadFile(string strHref, string filePath)
     {
         HttpWebRequest hreq = null;
         try
         {
             hreq = (HttpWebRequest)WebRequest.Create(strHref);

             // Timeout of 10 seconds.
             hreq.Timeout = 10 * 1000;
             hreq.Method = "GET";

             // Buffer the whole body in memory first, so a download that fails part-way never
             // leaves a truncated file at filePath. The using blocks release the connection and
             // both streams even when reading throws.
             byte[] content;
             using (HttpWebResponse hres = (HttpWebResponse)hreq.GetResponse())
             using (Stream rs = hres.GetResponseStream())
             using (MemoryStream memoryStream = new MemoryStream())
             {
                 rs.CopyTo(memoryStream);
                 content = memoryStream.ToArray();
             }

             File.WriteAllBytes(filePath, content);
             return true;
         }
         catch (Exception)
         {
             // Best-effort contract: any failure in the try block (bad URL, network error,
             // timeout, file write error) is reported to the caller as false.
             return false;
         }
         finally
         {
             if (hreq != null)
             {
                 hreq.Abort();
             }
         }
     }

    4.解析下载下来的附件

      解析 word:首先电脑上必须安装 Office Word;然后在项目的"引用"中添加 Microsoft.Office.Interop.Word。

    解析word文件转换成txt文本
     /// <summary>
     /// Extracts the text of a Word document by opening it through the
     /// Microsoft.Office.Interop.Word automation API.
     /// </summary>
     /// <param name="fileName">Path of the Word document to read.</param>
     /// <returns>The document's text, or an empty string when the document could not be read.</returns>
     private string GetTextFWord(string fileName)
     {
         Microsoft.Office.Interop.Word.ApplicationClass wordApp = null;
         Microsoft.Office.Interop.Word.Document doc = null;
         object nullobj = System.Reflection.Missing.Value;
         try
         {
             wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
             object file = fileName;
             doc = wordApp.Documents.Open(
                 ref file, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj, ref nullobj);

             // Content spans the whole main document, so this reads all of its text.
             return doc.Content.Text;
         }
         catch
         {
             // Any failure while starting Word or opening/reading the document yields an empty result.
             return string.Empty;
         }
         finally
         {
             // Clean up on every path: without Quit() each call leaves a WINWORD.EXE process running.
             // Each step is guarded separately so a failing Close cannot prevent Quit, and a cleanup
             // failure cannot replace the value being returned.
             if (doc != null)
             {
                 try
                 {
                     // Cast to _Document so Close resolves to the method, not the Close event.
                     ((Microsoft.Office.Interop.Word._Document)doc).Close(ref nullobj, ref nullobj, ref nullobj);
                 }
                 catch
                 {
                     // Ignored: the document may already be closed or Word may have exited.
                 }
                 System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
             }

             if (wordApp != null)
             {
                 try
                 {
                     // Cast to _Application so Quit resolves to the method, not the Quit event.
                     ((Microsoft.Office.Interop.Word._Application)wordApp).Quit(ref nullobj, ref nullobj, ref nullobj);
                 }
                 catch
                 {
                     // Ignored: nothing more can be done if Word refuses to quit.
                 }
                 System.Runtime.InteropServices.Marshal.ReleaseComObject(wordApp);
             }
         }
     }

      

      解析 pdf:用到的是第三方组件 PDFBox-0.7.3.dll,大家可以去网上搜索下载。

    pdf文件转换成文本内容
     /// <summary>
     /// Extracts the text of a PDF file using PDFBox.
     /// </summary>
     /// <param name="fileName">Path of the PDF file to read.</param>
     /// <returns>The extracted text, or an empty string when the file could not be read.</returns>
     private string GetTextFPDF(string fileName)
     {
         PDDocument doc = null;
         try
         {
             doc = PDDocument.load(fileName);
             PDFTextStripper pdfStripper = new PDFTextStripper();

             // Use the platform's line separator in the extracted text.
             pdfStripper.setLineSeparator(Environment.NewLine);
             return pdfStripper.getText(doc);
         }
         catch
         {
             // Any failure while loading or extracting yields an empty result.
             return string.Empty;
         }
         finally
         {
             // Close the document on every path so it is not left open when extraction throws.
             if (doc != null)
             {
                 try
                 {
                     doc.close();
                 }
                 catch
                 {
                     // Ignored: a failure to close must not replace the value being returned.
                 }
             }
         }
     }

      解析txt

    txt解析成文本内容
  • 相关阅读:
    高斯拉普拉斯算子(Laplace of Gaussian)
    Windows的TCP协议参数
    poj 1182食物链(并查集)
    linux网络体系架构
    谈谈对于企业级系统架构的理解
    Redis源码解析(1)——源码目录介绍
    在多台服务器上简单实现Redis的数据主从复制
    利用Nginx做负载均衡
    C#中的BackgroundWorker控件
    C#中的线程(四)高级话题
  • 原文地址:https://www.cnblogs.com/lian9527/p/2313041.html
Copyright © 2011-2022 走看看