zoukankan      html  css  js  c++  java
  • 蜘蛛爬虫

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;

    namespace ConsoleApplication1
    {
        class Program
        {
            static void Main(string[] args)
            {
                //需要解析的集合
                List<string> list = new List<string>();
                //已经解析的集合
                List<string> listCount = new List<string>();
               
                list.Add("http://www.baidu.com");

                ReadHtml(list,listCount);
                Console.ReadLine();
            }

            /// <summary>
            /// 读取HTML中的URL
            /// </summary>
            /// <param name="list"></param>
            /// <param name="listCount"></param>
            public static void ReadHtml(List<string> list, List<string> listCount)
            {
                List<string> count = new List<string>();
                for (int a = 0; a < list.Count; a++)
                {
                    //没有解析过该项
                    if (!listCount.Contains(list[a]))
                    {
                        try
                        {
                            //在已解析过的集合里面添加本条数据
                            listCount.Add(list[a]);
                            WebRequest req = WebRequest.Create(list[a]);
                            WebResponse result = req.GetResponse();
                            //得到的流是网页内容  
                            Stream ReceiveStream = result.GetResponseStream();
                            StreamReader readerOfStream = new StreamReader(ReceiveStream, System.Text.Encoding.GetEncoding("GB2312"));
                            //得到当前URL的源码
                            string str = readerOfStream.ReadToEnd();
                            //解析
                            Regex regex = new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
                            foreach (Match mc in regex.Matches(str))
                            {
                                Regex regexOhter = new Regex(list[a] + "|.png|.jpg|.gif|.bmp|.js|.css|.xls|.doc|.pdf|.chw|.exe|.mp3|.mp4|.avi|.swf|.xml");
                                if (!regexOhter.IsMatch(mc.ToString()))
                                {
                                    Console.WriteLine(mc);
                                    count.Add(mc.ToString());
                                }
                            }
                            Console.WriteLine("----------------------解析完一个页面!--------------------");
                            if (a == list.Count - 1)
                            {
                                //递归调用本方法
                                ReadHtml(count, listCount);
                            }
                        }
                        catch (System.Exception ex) { }
                        finally
                        {
                            List<string> error = new List<string>();
                            //如果出错在出错的后面一条URL继续解析
                            for (int z = a + 1; z < list.Count; z++)
                            {
                                error.Add(list[z]);
                            }
                            //继续解析
                            ReadHtml(error, listCount);
                        }
                    }
                }
            }
        }
    }

  • 相关阅读:
    python操作adb代码
    android sdcard 权限管理策略研究
    glom模块的使用(二)
    爬虫错误汇总
    微博展开全文获取
    数据清洗之微博内容清洗
    Esxi5-管理平台vcenter5.0_数据库迁移流程
    migrating-vcenter-database-express-to-sql-2008-r2
    Centos生成SSL证书的步骤
    找到一篇关于2.4/5G信道的新介绍
  • 原文地址:https://www.cnblogs.com/java20130723/p/3211470.html
Copyright © 2011-2022 走看看