zoukankan      html  css  js  c++  java
  • C# 实现抓取网页内容(一)

    一、窗体应用程序界面(一个网址输入框 textBox1 和一个开始按钮 btn_Start):

    二、上源码:

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Threading.Tasks;
    using System.Windows.Forms;

    namespace WebCatchTest0911
    {
        /// <summary>
        /// Sample form that downloads the HTML source of the address typed into
        /// <c>textBox1</c> when the start button is clicked.
        /// </summary>
        public partial class Form1 : Form
        {
            /// <summary>
            /// Cookies gathered from earlier responses. They are attached to every
            /// request issued by <see cref="GetWebPageSource"/> and extended with the
            /// cookies of each successful response.
            /// </summary>
            public static CookieCollection CC = new CookieCollection();

            /// <summary>Creates the form and its designer-generated controls.</summary>
            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Click handler of the start button: fetches the page whose address is in
            /// <c>textBox1</c>.
            /// </summary>
            private void btn_Start_Click(object sender, EventArgs e)
            {
                // NOTE(review): the download is synchronous, so the UI thread is blocked
                // until the server answers. The result is not consumed here yet.
                string pageSource = GetWebPageSource(textBox1.Text.Trim());
            }

            /// <summary>
            /// Downloads the page at <paramref name="Url"/> with an HTTP GET and returns
            /// its body decoded as GBK.
            /// </summary>
            /// <param name="Url">
            /// Absolute address of the page. A leading <c>about:</c> scheme is rewritten
            /// to <c>http:</c> before the request is made.
            /// </param>
            /// <returns>
            /// The page source, or an empty string when the address is null/blank or the
            /// request fails for any reason (best-effort contract: this method does not
            /// throw).
            /// </returns>
            public static string GetWebPageSource(string Url)
            {
                if (string.IsNullOrWhiteSpace(Url))
                {
                    return "";
                }

                // Rewrite only the scheme. Replacing every "about" in the string would
                // corrupt ordinary addresses such as http://example.com/about.
                // Presumably the "about:" prefix comes from links read out of a browser
                // control - TODO confirm with the caller.
                const string aboutScheme = "about:";
                if (Url.StartsWith(aboutScheme, StringComparison.OrdinalIgnoreCase))
                {
                    Url = "http:" + Url.Substring(aboutScheme.Length);
                }

                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                    request.Method = "GET";
                    request.ProtocolVersion = HttpVersion.Version11;
                    request.KeepAlive = true;
                    request.AllowAutoRedirect = true;

                    // The Host header is left to HttpWebRequest, which derives it from
                    // the URL; a hard-coded host breaks every other site.
                    // Content negotiation: ask for gzip/deflate and let the framework
                    // decompress. ("GBK" is a charset, not a content-coding, so it does
                    // not belong in Accept-Encoding.)
                    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
                    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36";
                    request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                    request.Headers.Add("Cache-Control", "max-age=0");

                    // Send the cookies collected so far.
                    CookieContainer cookieJar = new CookieContainer();
                    cookieJar.Add(CC);
                    request.CookieContainer = cookieJar;

                    // using-blocks release the connection even when reading fails.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream body = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GBK")))
                    {
                        string html = reader.ReadToEnd();

                        // Merge instead of overwrite: a response that sets no cookies
                        // must not wipe the ones remembered from earlier requests.
                        CC.Add(response.Cookies);
                        return html;
                    }
                }
                catch (Exception ex)
                {
                    // Deliberate best-effort: callers rely on "" for any failure, but
                    // the reason is traced so that it is not lost completely.
                    System.Diagnostics.Trace.TraceWarning("GetWebPageSource failed for {0}: {1}", Url, ex.Message);
                    return "";
                }
            }
        }
    }

    三、总结

    1)、HttpWebRequest 的各项参数(请求头等)可以在浏览器的开发者工具(F12)中查看;

    2)、注意释放资源(例如用 using 包裹 HttpWebResponse 和 StreamReader);

    四、下章实现提取网页内容

  • 相关阅读:
    Socket通信
    浏览器调用打印机
    python dict操作
    python list操作
    python 模块Example链接
    python random模块
    python configparser模块
    python unittest模块
    python timeit模块
    python datetime模块
  • 原文地址:https://www.cnblogs.com/czqbk/p/4801605.html
Copyright © 2011-2022 走看看