zoukankan      html  css  js  c++  java
  • c# 拉取网页

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    using System.Net;
    using System.IO;
    
    
    namespace xsharp
    {
        /// <summary>
        /// Console scraper: downloads a paginated index page, prints every link found
        /// in its link list, follows the "next page" link recursively, and can extract
        /// the text of a content page (the <c>chaptercontent</c> div) into a local file.
        /// All parsing is plain substring searching over the downloaded HTML.
        /// </summary>
        class Program
        {
            /// <summary>Directory that extracted content files are written to.</summary>
            static string sDir = @"G:\notex\";

            /// <summary>Shared client used for every download.</summary>
            static WebClient MyWebClient = new WebClient();

            /// <summary>Base URL of the index; page names are appended to it.</summary>
            static string sMain = "";

            /// <summary>Marker that precedes the target of every link.</summary>
            const string HrefKey = "a href=";

            /// <summary>Length of <c>a href="</c>: the marker plus the opening quote.</summary>
            const int HrefPrefixLength = 8;

            /// <summary>Text and closing tag of the "next page" pager link.</summary>
            const string NextPageKey = "下一页</a>";

            /// <summary>
            /// Extracts the title and body text from a content page, echoes both to the
            /// console and writes the body to <c>&lt;sDir&gt;\&lt;idx&gt;.html</c>.
            /// </summary>
            /// <param name="sHtml">Full HTML of the content page.</param>
            /// <param name="idx">Number used as the output file name.</param>
            /// <returns>0 on success, -1 when an expected marker is missing.</returns>
            static int writeContent(string sHtml, int idx)
            {
                // The title is taken from the "keywords" meta tag.
                const string titleKey = "<meta name=\"keywords\" content=\"";
                int iBgnIdx = sHtml.IndexOf(titleKey, StringComparison.Ordinal);
                if (iBgnIdx < 0)
                {
                    return -1;
                }

                int iTitleBgn = iBgnIdx + titleKey.Length;
                int iEndIdx = sHtml.IndexOf('"', iTitleBgn);
                if (iEndIdx < 0)
                {
                    // Unterminated attribute: without this check Substring would throw.
                    return -1;
                }

                string sTitle = sHtml.Substring(iTitleBgn, iEndIdx - iTitleBgn);
                Console.WriteLine(sTitle);

                // Cut out everything from the content div up to the first closing </div>.
                iBgnIdx = sHtml.IndexOf("<div id=\"chaptercontent", StringComparison.Ordinal);
                if (iBgnIdx < 0)
                {
                    return -1;
                }

                iEndIdx = sHtml.IndexOf("</div>", iBgnIdx + 1, StringComparison.Ordinal);
                if (iEndIdx < 0)
                {
                    return -1;
                }

                string sDivSub = sHtml.Substring(iBgnIdx, iEndIdx - iBgnIdx);

                // The text sits between the first </p> and the following <p style...>.
                iBgnIdx = sDivSub.IndexOf("</p>", StringComparison.Ordinal);
                if (iBgnIdx < 0)
                {
                    return -1;
                }

                // Skips "</p>" plus one more character (presumably a line break - TODO confirm).
                int iContentBgn = iBgnIdx + 5;
                if (iContentBgn > sDivSub.Length)
                {
                    return -1;
                }

                iEndIdx = sDivSub.IndexOf("<p style", iContentBgn, StringComparison.Ordinal);
                if (iEndIdx < 0)
                {
                    return -1;
                }

                string sContentSub = sDivSub.Substring(iContentBgn, iEndIdx - iContentBgn);
                // NOTE(review): the published listing shows Replace(" ", " "), which does
                // nothing; the search text is assumed to have been the "&nbsp;" entity - confirm.
                sContentSub = sContentSub.Replace("&nbsp;", " ");
                sContentSub = sContentSub.Replace("<br />", Environment.NewLine);

                Console.WriteLine(sContentSub);

                // Make sure the target directory exists before writing into it.
                Directory.CreateDirectory(sDir);
                string sHtmlPath = Path.Combine(sDir, idx.ToString() + ".html");
                write2File(ref sContentSub, ref sHtmlPath);
                return 0;
            }

            /// <summary>
            /// Downloads a page and decodes it as UTF-8.
            /// </summary>
            /// <param name="sUrl">Absolute URL to download.</param>
            /// <param name="pageHtml">Receives the decoded page text.</param>
            /// <returns>Always 0; download failures surface as exceptions.</returns>
            static int downPage(string sUrl, ref string pageHtml)
            {
                byte[] pageData = MyWebClient.DownloadData(sUrl);
                // For pages served as GB2312 use Encoding.Default instead of UTF-8.
                pageHtml = Encoding.UTF8.GetString(pageData);
                return 0;
            }

            /// <summary>
            /// Finds the target of the "next page" link in one line of the index HTML.
            /// </summary>
            /// <param name="se">A line of HTML that contains the "next page" link.</param>
            /// <returns>
            /// The last path segment of the link including its leading '/', or an empty
            /// string when the link is disabled (last page) or cannot be parsed.
            /// </returns>
            static string getNextPageUrl(string se)
            {
                const string disabledKey = "=\"disabled\">";

                int iLastPage = se.IndexOf(NextPageKey, StringComparison.Ordinal);
                if (iLastPage < 0)
                {
                    return "";
                }

                // On the last page the link text is directly preceded by ="disabled">.
                // The length guard avoids a negative start index on short lines.
                if (iLastPage >= disabledKey.Length
                    && string.CompareOrdinal(se, iLastPage - disabledKey.Length, disabledKey, 0, disabledKey.Length) == 0)
                {
                    return "";
                }

                // Search backwards so that the href belonging to this link is picked.
                int iHrefBgn = se.LastIndexOf(HrefKey, iLastPage, StringComparison.Ordinal);
                if (iHrefBgn < 0)
                {
                    return "";
                }

                iHrefBgn = iHrefBgn + HrefPrefixLength;
                if (iHrefBgn > se.Length)
                {
                    return "";
                }

                int iHrefEnd = se.IndexOf('"', iHrefBgn);
                if (iHrefEnd < 0)
                {
                    return "";
                }

                string sHref = se.Substring(iHrefBgn, iHrefEnd - iHrefBgn);
                if (sHref.Length == 0)
                {
                    return "";
                }

                // Only look for '/' inside the href itself, never in the markup before it.
                int iSlash = sHref.LastIndexOf('/');
                if (iSlash < 0)
                {
                    return "/" + sHref;
                }

                return sHref.Substring(iSlash);
            }

            /// <summary>
            /// Extracts the target and the text of the first link in a line of HTML.
            /// </summary>
            /// <param name="sKeys">A line of HTML containing <c>a href="..."&gt;text&lt;/a&gt;</c>.</param>
            /// <param name="sHref">Receives the link target.</param>
            /// <param name="sTitle">Receives the link text.</param>
            /// <returns>
            /// 0 on success, -1 when no link is found, -2 when the href is not
            /// terminated, -3 when the closing <c>&lt;/a&gt;</c> is missing.
            /// </returns>
            static int getContentUrl(string sKeys, ref string sHref, ref string sTitle)
            {
                int iHrefBgn = sKeys.IndexOf(HrefKey, StringComparison.Ordinal);
                if (iHrefBgn < 0)
                {
                    return -1;
                }

                iHrefBgn = iHrefBgn + HrefPrefixLength;
                if (iHrefBgn > sKeys.Length)
                {
                    return -2;
                }

                int iHrefEnd = sKeys.IndexOf('"', iHrefBgn);
                if (iHrefEnd < 0)
                {
                    return -2;
                }

                sHref = sKeys.Substring(iHrefBgn, iHrefEnd - iHrefBgn);

                // Skip the closing quote and the '>' that ends the opening tag.
                int iTitleBgn = iHrefEnd + 2;
                if (iTitleBgn > sKeys.Length)
                {
                    return -3;
                }

                // Search after the href so an earlier "</a>" cannot give a negative length.
                int iTitleEnd = sKeys.IndexOf("</a>", iTitleBgn, StringComparison.Ordinal);
                if (iTitleEnd < 0)
                {
                    return -3;
                }

                sTitle = sKeys.Substring(iTitleBgn, iTitleEnd - iTitleBgn);
                return 0;
            }

            /// <summary>
            /// Joins the base URL and a page name with exactly one '/' between them.
            /// </summary>
            static string combineUrl(string sBase, string sRelative)
            {
                if (string.IsNullOrEmpty(sRelative))
                {
                    return sBase;
                }

                return sBase.TrimEnd('/') + "/" + sRelative.TrimStart('/');
            }

            /// <summary>
            /// Downloads one index page, prints every link found between the second
            /// "recommend" div and the "note" paragraph, and recurses into the next
            /// index page when a "next page" link is present.
            /// </summary>
            /// <param name="sIndexUrl">Page name relative to <c>sMain</c>; empty for the first page.</param>
            /// <returns>
            /// 0 on success; -1 / -2 when the first / second "recommend" div is
            /// missing, -3 when the "note" paragraph is missing.
            /// </returns>
            static int dealIndexPage(string sIndexUrl)
            {
                string pageHtml = "";
                downPage(combineUrl(sMain, sIndexUrl), ref pageHtml);

                const string sRecommendKey = "<div class=\"recommend\">";
                int iBgnIdx = pageHtml.IndexOf(sRecommendKey, StringComparison.Ordinal);
                if (iBgnIdx < 0)
                {
                    return -1;
                }

                // The link list starts at the second occurrence of the marker.
                iBgnIdx = pageHtml.IndexOf(sRecommendKey, iBgnIdx + sRecommendKey.Length, StringComparison.Ordinal);
                if (iBgnIdx < 0)
                {
                    return -2;
                }

                const string sNoteKey = "<p class=\"note\">";
                int iEndIdx = pageHtml.IndexOf(sNoteKey, iBgnIdx + 10, StringComparison.Ordinal);
                if (iEndIdx < 0)
                {
                    return -3;
                }

                string sHrefArray = pageHtml.Substring(iBgnIdx, iEndIdx - iBgnIdx);
                // Split on either line-break character so "\r\n" and "\n" pages both work.
                string[] sTmpArray = sHrefArray.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string se in sTmpArray)
                {
                    if (se.IndexOf(HrefKey, StringComparison.Ordinal) < 0)
                    {
                        continue;
                    }

                    Console.Write(se + "\n");
                    if (se.IndexOf(NextPageKey, StringComparison.Ordinal) >= 0)
                    {
                        // Pager line: follow the next index page, if there is one.
                        string sNext = getNextPageUrl(se);
                        if (sNext != "")
                        {
                            Console.Write("nextpage  " + sNext + "  \n");
                            dealIndexPage(sNext);
                        }
                        else
                        {
                            Console.Write("Finish..........\n");
                        }
                    }
                    else
                    {
                        // Ordinary link line.
                        string sUrl = "";
                        string sTitle = "";
                        if (0 == getContentUrl(se, ref sUrl, ref sTitle))
                        {
                            Console.Write("   " + sUrl + "  " + sTitle + "\n");
                        }
                        else
                        {
                            Console.Write("   deal...error \n");
                        }
                    }
                }

                Console.Write("aaaaaaaaaaaaa\n");
                return 0;
            }

            /// <summary>
            /// Writes text to a file, replacing any existing content.
            /// </summary>
            /// <param name="pageContext">Text to write.</param>
            /// <param name="sPath">Path of the file to create or overwrite.</param>
            static void write2File(ref string pageContext, ref string sPath)
            {
                using (StreamWriter sw = new StreamWriter(sPath))
                {
                    sw.Write(pageContext);
                }
            }

            /// <summary>
            /// Entry point: walks the index starting at the configured base URL and
            /// reports download errors on the console.
            /// </summary>
            static void Main(string[] args)
            {
                try
                {
                    // Credentials used to authenticate requests to the Internet resource.
                    MyWebClient.Credentials = CredentialCache.DefaultCredentials;
                    sMain = "http://wap.xxbiquge.com/59_59865/";
                    dealIndexPage("");
                }
                catch (WebException webEx)
                {
                    Console.WriteLine(webEx.Message);
                }

                // Keep the console window open until Enter is pressed.
                Console.ReadLine();
            }
        }
    }




    /// <summary>
    /// Serializes an object graph into a byte array with <c>BinaryFormatter</c>.
    /// </summary>
    /// <param name="data">The object to serialize.</param>
    /// <returns>A new array holding exactly the serialized bytes.</returns>
    public byte[] Serialize(object data)
    {
        // NOTE(security): BinaryFormatter is obsolete and unsafe for data that crosses a
        // trust boundary; kept here because callers rely on this wire format.
        BinaryFormatter formatter = new BinaryFormatter();
        using (MemoryStream rems = new MemoryStream())
        {
            formatter.Serialize(rems, data);
            // ToArray copies only the bytes written. GetBuffer would return the whole
            // internal buffer, including unused capacity after the payload.
            return rems.ToArray();
        }
    }

    /// <summary>
    /// Deserializes an object graph from a byte array with <c>BinaryFormatter</c>.
    /// </summary>
    /// <param name="data">Buffer holding the serialized bytes.</param>
    /// <returns>
    /// The deserialized object. When deserialization fails, the error is written to
    /// the console and a plain <see cref="object"/> instance is returned instead.
    /// </returns>
    public object Deserialize(byte[] data)
    {
        // NOTE(security): BinaryFormatter can execute attacker-controlled code while
        // deserializing; never feed it data from an untrusted source.
        BinaryFormatter formatter = new BinaryFormatter();
        object obj = new object();
        using (MemoryStream rems = new MemoryStream(data))
        {
            try
            {
                obj = formatter.Deserialize(rems);
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and fall through to the placeholder.
                Console.Write("BaseAction序列化bug:" + ex.ToString());
            }
        }
        return obj;
    }

      

  • 相关阅读:
    python入门_老男孩_文件操作
    python入门_老男孩_列表和字典循环删除的正确方法
    python入门_老男孩_集合_元祖
    linux入门_韩顺平_复习版_文件目录类
    python-re模块
    sorted()函数
    偏函数+高阶函数
    生成器
    闭包
    匿名函数
  • 原文地址:https://www.cnblogs.com/yylingyao/p/7162059.html
Copyright © 2011-2022 走看看