zoukankan      html  css  js  c++  java
  • 将网页上word、pdf、txt文件下载下来,解析成文本内容

      1.第一步解析网页

      首先创建一个 HttpWebRequest:

    创建HttpWebRequest
     /// <summary>
     /// Creates an <see cref="HttpWebRequest"/> for <paramref name="url"/> with a
     /// Firefox user-agent string, wildcard Accept/Content-Type headers, a fixed
     /// referer and a fresh cookie container.
     /// </summary>
     /// <param name="url">Absolute http/https URL to request.</param>
     /// <param name="cookies">Cookies holding the authenticated user's information; may be null.</param>
     /// <returns>The configured request; it has not been sent yet.</returns>
     /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
     /// <exception cref="ArgumentException"><paramref name="url"/> does not produce an HTTP request.</exception>
     public static HttpWebRequest CreateHttpWebRequest(string url, CookieCollection cookies)
     {
         if (url == null)
         {
             throw new ArgumentNullException("url");
         }

         // WebRequest.Create hands back a non-HTTP request type for schemes such as
         // file:// or ftp://, in which case the "as" cast yields null. Fail with a
         // descriptive error instead of a NullReferenceException further down.
         HttpWebRequest webRequest = WebRequest.Create(url) as HttpWebRequest;
         if (webRequest == null)
         {
             throw new ArgumentException("Only http/https URLs are supported: " + url, "url");
         }

         webRequest.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10";
         webRequest.Accept = "*/*";
         webRequest.ContentType = "*/*";
         // NOTE(review): the referer is hard-coded; confirm the target site expects this value.
         webRequest.Referer = "http://www.java.com.cn";

         // Always attach a container so cookies set by the server are kept on the request.
         webRequest.CookieContainer = new CookieContainer();
         if (cookies != null)
         {
             webRequest.CookieContainer.Add(cookies);
         }

         return webRequest;
     }

      再通过它得到响应(HttpWebResponse):

    // Obtain the response for the request and treat it as an HttpWebResponse.
    // NOTE(review): `req` is presumably the request built by CreateHttpWebRequest - confirm.
    // NOTE(review): the response should be closed/disposed once it has been read.
    HttpWebResponse res = (HttpWebResponse)req.GetResponse();
    

     2.判断网址内容是不是附件:HttpWebResponse 有个 ContentType 属性,如果它是 application/x-msdownload,就说明网址内容是附件。然后根据响应的 Headers 就可以得到附件的格式(word、pdf 或 txt)。

    网页内容格式
    // An attachment is served as application/x-msdownload. The media type may be
    // followed by parameters (e.g. "; charset=..."), so only the prefix is compared.
    if (res.ContentType != null &&
        res.ContentType.StartsWith("application/x-msdownload", System.StringComparison.OrdinalIgnoreCase))
    {
        // Look the header up by name: header order is not guaranteed, so index 0
        // is not reliably the one that carries the file name.
        string header = res.Headers["Content-Disposition"] ?? string.Empty;

        // Isolate the filename parameter, e.g. attachment; filename="report.doc"
        int nameAt = header.IndexOf("filename=", System.StringComparison.OrdinalIgnoreCase);
        string fileName = nameAt >= 0 ? header.Substring(nameAt + "filename=".Length) : header;
        int paramEnd = fileName.IndexOf(';');
        if (paramEnd >= 0)
        {
            fileName = fileName.Substring(0, paramEnd);
        }
        fileName = fileName.Trim().Trim('"', '\'');

        // Attachment format = text after the last dot ("doc", "pdf", "txt", ...);
        // empty when the header carries no extension.
        int dotAt = fileName.LastIndexOf('.');
        string type = dotAt >= 0 ? fileName.Substring(dotAt + 1) : string.Empty;
    }

     3.然后下载附件

    下载文件
     /// <summary>
     /// Downloads the resource at <paramref name="strHref"/> and saves it to
     /// <paramref name="filePath"/>.
     /// </summary>
     /// <param name="strHref">URL of the file to download.</param>
     /// <param name="filePath">Path the downloaded bytes are written to.</param>
     /// <returns>
     /// <c>true</c> when the content was fully downloaded and written;
     /// <c>false</c> on any failure (bad URL, timeout, network or file error).
     /// </returns>
     public bool DownLoadFile(string strHref, string filePath)
     {
         HttpWebRequest hreq = null;
         try
         {
             hreq = (HttpWebRequest)WebRequest.Create(strHref);

             // Give up when the server has not answered within 10 seconds.
             hreq.Timeout = 10 * 1000;
             hreq.Method = "GET";

             byte[] content;
             // The using blocks release the connection and both streams even when
             // reading fails part-way through.
             using (HttpWebResponse hres = (HttpWebResponse)hreq.GetResponse())
             using (Stream rs = hres.GetResponseStream())
             using (MemoryStream memoryStream = new MemoryStream())
             {
                 byte[] buffer = new byte[8192];
                 int read;
                 while ((read = rs.Read(buffer, 0, buffer.Length)) > 0)
                 {
                     memoryStream.Write(buffer, 0, read);
                 }
                 content = memoryStream.ToArray();
             }

             // Written only after the download completed, so a failed transfer
             // never leaves a partial file behind.
             File.WriteAllBytes(filePath, content);
             return true;
         }
         catch (Exception)
         {
             // Deliberate best-effort: callers only learn success or failure.
             return false;
         }
         finally
         {
             if (hreq != null)
             {
                 hreq.Abort();
             }
         }
     }

    4.解析下载下来的附件

      解析word:首先电脑上必须安装 Office Word;然后在项目引用中添加 Microsoft.Office.Interop.Word。

    解析word文件转换成txt文本
     /// <summary>
     /// Extracts the plain text of a Word document through Word COM automation.
     /// </summary>
     /// <param name="fileName">Path of the Word document to read.</param>
     /// <returns>The document's full text, or an empty string when it cannot be read.</returns>
     private string GetTextFWord(string fileName)
     {
         Microsoft.Office.Interop.Word.ApplicationClass wordApp = null;
         Microsoft.Office.Interop.Word.Document doc = null;
         object nullobj = System.Reflection.Missing.Value;
         // The document is only read, so nothing must ever be saved (or prompted for) on close.
         object doNotSave = Microsoft.Office.Interop.Word.WdSaveOptions.wdDoNotSaveChanges;

         try
         {
             wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
             object file = fileName;
             doc = wordApp.Documents.Open(
                 ref file, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj,
                 ref nullobj, ref nullobj, ref nullobj, ref nullobj);

             // Content.Text covers the whole document body.
             return doc.Content.Text;
         }
         catch
         {
             // Best-effort contract: any failure is reported as "no text".
             return string.Empty;
         }
         finally
         {
             // Cleanup failures must not replace the result computed above, and
             // a failure on the document must not stop Word itself being quit.
             if (doc != null)
             {
                 try
                 {
                     doc.Close(ref doNotSave, ref nullobj, ref nullobj);
                 }
                 catch
                 {
                 }
                 System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
             }

             if (wordApp != null)
             {
                 try
                 {
                     // Without Quit every call leaves a WINWORD.EXE process running.
                     // The cast selects the Quit method rather than the Quit event.
                     ((Microsoft.Office.Interop.Word._Application)wordApp).Quit(ref doNotSave, ref nullobj, ref nullobj);
                 }
                 catch
                 {
                 }
                 System.Runtime.InteropServices.Marshal.ReleaseComObject(wordApp);
             }
         }
     }

      

      解析pdf:用到的是第三方库 PDFBox-0.7.3.dll,大家可以去网上搜索下载。

    pdf文件转换成文本内容
     /// <summary>
     /// Extracts the text of a PDF file using PDFBox.
     /// </summary>
     /// <param name="fileName">Path of the PDF file to read.</param>
     /// <returns>The extracted text, or an empty string when the file cannot be read.</returns>
     private string GetTextFPDF(string fileName)
     {
         PDDocument doc = null;
         try
         {
             doc = PDDocument.load(fileName);
             PDFTextStripper pdfStripper = new PDFTextStripper();
             // Use the platform line break between extracted lines.
             pdfStripper.setLineSeparator(Environment.NewLine);
             return pdfStripper.getText(doc);
         }
         catch
         {
             // Best-effort contract: any failure is reported as "no text".
             return string.Empty;
         }
         finally
         {
             // Close the document even when extraction failed; a close failure
             // must not replace the result computed above.
             if (doc != null)
             {
                 try
                 {
                     doc.close();
                 }
                 catch
                 {
                 }
             }
         }
     }

      解析txt

    txt解析成文本内容
  • 相关阅读:
    oracle 日期和时间转换
    layui 分页 java后端封装
    excel 时间格式
    excel的编程VBA
    excel条件格式
    python列表变成字符串
    Django的ORM源码学习
    robot 源码解读6【元类和描述符类】
    @staticmethod
    python 类定义后调用名称也执行内部代码
  • 原文地址:https://www.cnblogs.com/lian9527/p/2313041.html
Copyright © 2011-2022 走看看