zoukankan      html  css  js  c++  java
  • 将网页上word、pdf、txt文件下载下来,解析成文本内容

      1.第一步解析网页

      首先创建一个HttpWebRequest:

    创建HttpWebRequest
     /// <summary>
     /// Creates an <see cref="HttpWebRequest"/> for the given URL, pre-populated with
     /// a browser-like User-Agent, Accept, Content-Type and Referer, plus a fresh cookie container.
     /// </summary>
     /// <param name="url">Address to request; must produce an HTTP(S) request.</param>
     /// <param name="cookies">Cookies carrying the authenticated user's information; may be null.</param>
     /// <returns>The configured request. Nothing is sent until the caller asks for the response.</returns>
     /// <exception cref="ArgumentException">
     /// Thrown when <paramref name="url"/> does not yield an <see cref="HttpWebRequest"/>
     /// (for example an ftp:// or file:// address).
     /// </exception>
     public static HttpWebRequest CreateHttpWebRequest(string url, CookieCollection cookies)
     {
         HttpWebRequest webRequest = WebRequest.Create(url) as HttpWebRequest;
         if (webRequest == null)
         {
             // WebRequest.Create returns other request types for non-HTTP schemes;
             // fail with a clear message instead of a NullReferenceException below.
             throw new ArgumentException("The URL does not produce an HTTP request: " + url, "url");
         }

         webRequest.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10";
         webRequest.Accept = "*/*";
         webRequest.ContentType = "*/*";
         webRequest.Referer = "http://www.java.com.cn";

         // Always attach a container; the supplied cookies are only added when given.
         webRequest.CookieContainer = new CookieContainer();
         if (cookies != null)
         {
             webRequest.CookieContainer.Add(cookies);
         }

         return webRequest;
     }

      再得到它的响应(HttpWebResponse):

    // Send the request and obtain the server's response.
    // NOTE(review): req is presumably the HttpWebRequest built by CreateHttpWebRequest — confirm.
    HttpWebResponse res = (HttpWebResponse)req.GetResponse();
    

     2.判断网址内容是不是附件的形式:HttpWebResponse有个ContentType属性,如果它的值是application/x-msdownload,则说明网址内容是附件的形式。然后根据Headers就可以得到附件的格式(word、pdf或txt)。

    网页内容格式
    // A response served as application/x-msdownload is treated as an attachment.
    // StartsWith tolerates parameters after the media type (e.g. "; charset=...").
    if (res.ContentType != null
        && res.ContentType.StartsWith("application/x-msdownload", StringComparison.OrdinalIgnoreCase))
    {
        // The first response header is taken to end with the attachment's file name
        // — TODO confirm against the target server; header order is not guaranteed.
        string[] values = res.Headers.Count > 0 ? res.Headers.GetValues(0) : null;
        string header = (values != null && values.Length > 0) ? values[0] : string.Empty;

        // Attachment format = last three characters (doc / pdf / txt);
        // left empty when the header is too short to hold an extension.
        string type = header.Length >= 3
            ? header.Substring(header.Length - 3, 3)
            : string.Empty;
    }

     3.然后下载附件

    下载文件
     /// <summary>
     /// Downloads the resource at <paramref name="strHref"/> and writes it to
     /// <paramref name="filePath"/>.
     /// </summary>
     /// <param name="strHref">URL of the file to download.</param>
     /// <param name="filePath">Destination path on disk.</param>
     /// <returns>
     /// <c>true</c> when the file was downloaded and written; <c>false</c> when any
     /// step (request, read, or write) failed.
     /// </returns>
     public bool DownLoadFile(string strHref, string filePath)
     {
         HttpWebRequest hreq = null;
         try
         {
             hreq = (HttpWebRequest)WebRequest.Create(strHref);

             // Give up after 10 seconds.
             hreq.Timeout = 10 * 1000;
             hreq.Method = "GET";

             byte[] content;

             // Dispose the response and both streams even when reading fails;
             // an unclosed response keeps its connection checked out.
             using (HttpWebResponse hres = (HttpWebResponse)hreq.GetResponse())
             using (Stream rs = hres.GetResponseStream())
             using (MemoryStream memoryStream = new MemoryStream())
             {
                 byte[] buffer = new byte[0x100];
                 int read;
                 while ((read = rs.Read(buffer, 0, buffer.Length)) > 0)
                 {
                     memoryStream.Write(buffer, 0, read);
                 }

                 content = memoryStream.ToArray();
             }

             // The file is only written after the whole body arrived, so a failed
             // download never leaves a partial file behind.
             File.WriteAllBytes(filePath, content);
             return true;
         }
         catch (Exception)
         {
             // Failure is reported through the return value by design.
             return false;
         }
         finally
         {
             if (hreq != null)
             {
                 hreq.Abort();
             }
         }
     }

    4.解析下载下来的附件

      解析word,首先电脑上得装了office word;然后在引用中找到Microsoft.Office.Interop.Word这个引用

    解析word文件转换成txt文本
     /// <summary>
     /// Extracts the full text of a Word document through Word automation.
     /// Requires Microsoft Word and a reference to Microsoft.Office.Interop.Word.
     /// </summary>
     /// <param name="fileName">Path of the Word document to read.</param>
     /// <returns>The document's text, or an empty string when it could not be read.</returns>
     private string GetTextFWord(string fileName)
     {
         // Typed as _Application/_Document so Close/Quit bind to the methods
         // rather than the same-named events on the Application/Document interfaces.
         Microsoft.Office.Interop.Word._Application wordApp = null;
         Microsoft.Office.Interop.Word._Document doc = null;
         object nullobj = System.Reflection.Missing.Value;
         try
         {
             wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
             object file = fileName;
             doc = wordApp.Documents.Open(
                 ref file, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj, ref nullobj);

             // Content.Text holds the whole document body.
             return doc.Content.Text;
         }
         catch
         {
             return string.Empty;
         }
         finally
         {
             // Always close the document and quit Word; otherwise every call
             // leaves a WINWORD.EXE process running in the background.
             if (doc != null)
             {
                 try
                 {
                     doc.Close(ref nullobj, ref nullobj, ref nullobj);
                 }
                 catch
                 {
                     // Best-effort cleanup: a failed close must not mask the result.
                 }
             }

             if (wordApp != null)
             {
                 try
                 {
                     wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
                 }
                 catch
                 {
                     // Best-effort cleanup: nothing more can be done if Quit fails.
                 }
             }
         }
     }

      

      解析pdf  ,用到的是第三方插件PDFBox-0.7.3.dll,大家可以去网上搜索

    pdf文件转换成文本内容
     /// <summary>
     /// Extracts the text of a PDF file using the PDFBox library.
     /// </summary>
     /// <param name="fileName">Path of the PDF file to read.</param>
     /// <returns>The extracted text, or an empty string when the file could not be read.</returns>
     private string GetTextFPDF(string fileName)
     {
         try
         {
             PDDocument doc = PDDocument.load(fileName);
             try
             {
                 PDFTextStripper pdfStripper = new PDFTextStripper();

                 // Use the platform line break between extracted lines.
                 pdfStripper.setLineSeparator(Environment.NewLine);
                 return pdfStripper.getText(doc);
             }
             finally
             {
                 // Close the document even when extraction throws.
                 doc.close();
             }
         }
         catch
         {
             return string.Empty;
         }
     }

      解析txt

    txt解析成文本内容
  • 相关阅读:
    _STORAGE_WRITE_ERROR_:./Application/Runtime/Cache/Home/f8995a0e1afcdadc637612fae5a3b585.php
    git 报错:没有权限 remote: error: unable to unlink old 'README.md' (Permission denied)
    深度学习入门实战(二)-用TensorFlow训练线性回归
    一条SQL搞定信息增益的计算
    Vue.js动画在项目使用的两个示例
    腾讯云安全:开发者必看|Android 8.0 新特性及开发指南
    腾讯云上Selenium用法示例
    一个只有99行代码的JS流程框架
    腾讯云上PhantomJS用法示例
    前端开发框架简介:angular和react
  • 原文地址:https://www.cnblogs.com/lian9527/p/2313041.html
Copyright © 2011-2022 走看看