zoukankan      html  css  js  c++  java
  • 网页抓取小工具

    最近在线看电子书,由于篇幅太长,而且找不到下载地址,于是写了个小工具,将电子书下载到本地。

    整体思路:

    1、抓取出目录中各章节的名称及URL

    2、遍历章节URL,获取具体内容

    3、将章节URL进行分包,交给多线程处理

    4、将处理完的内容重新整理,按章节名称排序

    5、将内容写入TXT文件

    首先抓取导航页面的内容,通过WebRequest对象获取网页内容

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and returns its body as text.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>
    /// The response body decoded with the class-level <c>encoding</c>,
    /// or an empty string when the request or the read fails.
    /// </returns>
    private static string GetHtml(string url)
    {
        try
        {
            WebRequest request = WebRequest.Create(url);
            request.Credentials = CredentialCache.DefaultCredentials;

            // using-statements release the response, stream and reader even when
            // ReadToEnd throws; previously they were only closed on the success path.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, encoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Still best-effort (callers treat "" as failure), but no longer silent:
            // report the reason so a failed chapter can be diagnosed.
            Console.Error.WriteLine("GetHtml failed for {0}: {1}", url, ex.Message);
            return "";
        }
    }

    通过正则获取章节地址及名称

    /// <summary>
    /// Extracts chapter links from an HTML page.
    /// </summary>
    /// <param name="html">HTML source to scan for anchor tags.</param>
    /// <returns>
    /// A dictionary mapping each absolute chapter URL to a normalized title of the
    /// form "第NNN节" (chapter number left-padded with zeros to three digits).
    /// Anchors whose text does not contain a chapter number are ignored, and only
    /// the first occurrence of a URL is kept.
    /// </returns>
    private static Dictionary<string, string> GetAllUrl(string html)
    {
        string titlePattern = @"第(?<index>\d+)节";
        Dictionary<string, string> dictRet = new Dictionary<string, string>();
        string pattern = @"<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>";
        Regex reg = new Regex(pattern, RegexOptions.IgnoreCase);

        foreach (Match m in reg.Matches(html))
        {
            // Strip line breaks and spaces so that e.g. "第 12 节" still matches.
            // '\r' and '\n' are removed individually: the original removed only the
            // literal sequence "\n\r", which does not match a Windows "\r\n" break.
            string title = NoHTML(m.Groups["text"].Value)
                .Replace("\r", "")
                .Replace("\n", "")
                .Replace(" ", "");

            Match mTitle = Regex.Match(title, titlePattern);
            if (!mTitle.Success)
            {
                continue;
            }

            // Zero-pad the chapter number so that a plain string sort orders chapters.
            title = string.Format("第{0}节", mTitle.Groups["index"].Value.PadLeft(3, '0'));

            string url = m.Groups["url"].Value;

            // Absolute links (http or https, any casing) are used as-is; everything
            // else is treated as relative to the configured site root.
            bool isAbsolute = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!isAbsolute)
            {
                url = string.Format("{0}/{1}", webUrl.TrimEnd('/'), url);
            }

            if (!dictRet.ContainsKey(url))
            {
                dictRet.Add(url, title);
            }
        }

        return dictRet;
    }

    按线程数对地址进行分包处理

    /// <summary>
    /// Splits the URL table into one bucket per worker thread.
    /// </summary>
    /// <param name="dictUrls">URL-to-title pairs to distribute.</param>
    /// <returns>
    /// A dictionary keyed by bucket index (0 .. threadCount - 1). Every index is
    /// present, possibly with an empty bucket. Buckets are filled in enumeration
    /// order with Count / threadCount entries each; the last bucket also receives
    /// the remainder.
    /// </returns>
    private static Dictionary<int, Dictionary<string, string>> SplitUrl(Dictionary<string, string> dictUrls)
    {
        Dictionary<int, Dictionary<string, string>> dictRet = new Dictionary<int, Dictionary<string, string>>();

        // Guard against a zero or negative thread count (previously a divide-by-zero).
        int bucketCount = threadCount > 0 ? threadCount : 1;

        // Create every bucket up front so callers can index 0..threadCount-1 safely,
        // even when there are fewer URLs than threads.
        for (int i = 0; i < bucketCount; i++)
        {
            dictRet.Add(i, new Dictionary<string, string>());
        }

        // Size of each bucket. Clamp to 1: with fewer URLs than buckets the integer
        // division yields 0, which made the original skip bucket 0 entirely.
        int splitCount = dictUrls.Count / bucketCount;
        if (splitCount < 1)
        {
            splitCount = 1;
        }

        int keyIndex = 0;
        int calCount = 0;
        foreach (KeyValuePair<string, string> item in dictUrls)
        {
            // Move to the next bucket once the current one is full; the last bucket
            // never advances, so it absorbs any remainder.
            if (calCount == splitCount && keyIndex < bucketCount - 1)
            {
                keyIndex++;
                calCount = 0;
            }

            dictRet[keyIndex].Add(item.Key, item.Value);
            calCount++;
        }

        return dictRet;
    }

    开启线程,获取内容,存入字典

    /// <summary>
    /// Entry point: splits the chapter URLs into buckets and starts one worker
    /// thread per non-empty bucket.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Dictionary<int, Dictionary<string, string>> dictSplitUrls = SplitUrl(dictUrls);
        for (int i = 0; i < threadCount; i++)
        {
            // A bucket may be absent or empty when there are fewer URLs than threads;
            // indexing it directly would throw KeyNotFoundException.
            Dictionary<string, string> dictRun;
            if (!dictSplitUrls.TryGetValue(i, out dictRun) || dictRun.Count == 0)
            {
                continue;
            }

            // Copy the loop counter so the delegate captures this iteration's value.
            int index = i;
            ThreadStart ts = delegate { Run(dictRun, index); };
            new Thread(ts).Start();
        }
    }
    
    /// <summary>
    /// Worker body: downloads and parses every chapter in one bucket, stores the
    /// result in the shared content table and writes a per-chapter text file.
    /// The worker that completes the final chapter overall triggers the merged
    /// output.
    /// </summary>
    /// <param name="dictUrls">URL-to-title pairs assigned to this worker.</param>
    /// <param name="threadId">Index of this worker, used only for console output.</param>
    private static void Run(Dictionary<string, string> dictUrls, int threadId)
    {
        // Set when this worker performed the increment that completed the job.
        bool finishedAll = false;

        foreach (KeyValuePair<string, string> item in dictUrls)
        {
            string url = item.Key;
            string title = item.Value;

            Console.WriteLine("Thread {0},deal with {1}:{2}...", threadId, title, url);
            string content = GetHtml(url);
            if (content.Length > 0)
            {
                content = AnalyseContent(content);
            }

            bool succeeded = content.Length > 0;
            if (succeeded)
            {
                Write(content, string.Format("{0}_{1}.txt", systemName, title));
            }

            // dictContent is shared between workers and is not safe for concurrent
            // writes, so every mutation happens under the same lock as the counter.
            lock (lockObj)
            {
                if (succeeded)
                {
                    dictContent[title] = content;
                }
                else if (!dictContent.ContainsKey(title))
                {
                    // Keep an empty placeholder so the chapter still appears in the output.
                    dictContent.Add(title, "");
                }

                dealedCount++;

                // Decide completion inside the lock: exactly one worker observes the
                // final count, so the merged file is produced once.
                if (dealedCount == totalCount)
                {
                    finishedAll = true;
                }
            }

            Console.WriteLine(succeeded ? "success" : "failed");
        }

        if (finishedAll)
        {
            WriteToTxt();
        }
    }

    最后将所有内容重新整理,输出到文本

    /// <summary>
    /// Concatenates all collected chapters, ordered by title, and writes them to a
    /// single text file named after <c>systemName</c>.
    /// </summary>
    private static void WriteToTxt()
    {
        Console.WriteLine("获取内容完成,生成TXT");

        // Titles are zero-padded ("第001节"), so ordering by key yields chapter order.
        StringBuilder sbContent = new StringBuilder();
        foreach (KeyValuePair<string, string> item in dictContent.OrderBy(t => t.Key))
        {
            Console.WriteLine("生成{0}", item.Key);
            sbContent.AppendFormat("{0} {1}", item.Key, item.Value);
        }

        Write(sbContent.ToString(), string.Format("{0}.txt", systemName));
        Console.WriteLine("生成TXT成功");
    }

    Demo

  • 相关阅读:
    Echarts图表常用功能配置,Demo示例
    Markdown 语法笔记
    EasyUI 通过 Combobox 实现 AutoComplete 效果
    python PIL Image基本的图片拼接、圆形裁减、添加文字
    Fiddler高级用法—Fiddler Script抓取app网页json数据并保存
    python elasticsearch环境搭建
    利用brich实现文本层次聚类,将文本内容分类
    python发送邮件带附件
    python-docx生成word文档
    python-pygal画图
  • 原文地址:https://www.cnblogs.com/FlySoul/p/2538947.html
Copyright © 2011-2022 走看看