zoukankan      html  css  js  c++  java
  • 简简单单C#爬虫小记

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading.Tasks;
    
    namespace 正则
    {
        /// <summary>
        /// Minimal console crawler: downloads a listing page, follows every link
        /// whose href contains "article", extracts the page title and the first
        /// <c>&lt;div class="content"&gt;</c> element with regular expressions, and
        /// saves each result to a numbered text file under <c>./data</c>.
        /// </summary>
        class Program
        {
            // Root used to resolve site-relative hrefs found on the listing page.
            private const string SiteRoot = "http://www.admin5.com/";

            // Listing page whose links are scanned for articles.
            private const string ListingUrl = "http://www.admin5.com/browse/177/";

            // Built once instead of on every loop iteration.
            private static readonly Regex HrefPattern = new Regex("(?<=href=\").*?(?=\")");
            private static readonly Regex TitlePattern = new Regex("(?<=title>).*?(?=</title>)");
            private static readonly Regex ContentPattern = new Regex(@"<div class=""content"">[\s\S]*?</div>");

            /// <summary>
            /// Entry point. Crawls the listing page and writes one file per article
            /// (1.txt, 2.txt, ...) containing the title, a line break, and the raw
            /// HTML of the content div. Waits for a key press before exiting.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                Uri siteRoot = new Uri(SiteRoot);
                string html = GetHtml(ListingUrl, Encoding.UTF8);
                string dataDirectory = Path.Combine(Directory.GetCurrentDirectory(), "data");

                int fileIndex = 1;
                foreach (Match link in HrefPattern.Matches(html))
                {
                    string href = link.Value;
                    if (!href.Contains("article"))
                    {
                        continue;
                    }

                    // Resolve the href against the site root so that relative links
                    // become fetchable absolute URLs; already-absolute links pass
                    // through unchanged. WebRequest.Create rejects relative URLs.
                    Uri articleUri;
                    if (!Uri.TryCreate(siteRoot, href, out articleUri)
                        || (articleUri.Scheme != Uri.UriSchemeHttp && articleUri.Scheme != Uri.UriSchemeHttps))
                    {
                        continue;
                    }

                    Console.WriteLine(articleUri.AbsoluteUri);
                    Console.WriteLine("抓取内容");
                    string page = GetHtml(articleUri.AbsoluteUri, Encoding.UTF8);

                    Match titleMatch = TitlePattern.Match(page);
                    Match contentMatch = ContentPattern.Match(page);
                    if (!titleMatch.Success || !contentMatch.Success)
                    {
                        // Page does not have the expected structure; skip it rather
                        // than crash on an out-of-range index.
                        continue;
                    }

                    Console.WriteLine("保存数据");
                    // CreateDirectory is a no-op when the directory already exists.
                    Directory.CreateDirectory(dataDirectory);
                    string filePath = Path.Combine(dataDirectory, fileIndex + ".txt");
                    File.WriteAllText(filePath, titleMatch.Value + Environment.NewLine + contentMatch.Value);
                    fileIndex++;
                    Console.WriteLine("保存成功");
                }

                Console.WriteLine("ok");
                Console.ReadKey();
            }

            /// <summary>
            /// Downloads the resource at <paramref name="url"/> and returns its body
            /// as text.
            /// </summary>
            /// <param name="url">Absolute URL to request.</param>
            /// <param name="encoding">
            /// Encoding used to decode the response body; UTF-8 is used when null.
            /// </param>
            /// <returns>The full response body as a string.</returns>
            private static string GetHtml(string url, Encoding encoding)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                // Dispose the response, stream and reader so the connection is
                // released after every request.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, encoding ?? Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    

      

    谢谢你长得这么好看还来看我的博客!
  • 相关阅读:
    C++内联函数
    C++类中创建线程
    windows下搭建Redis集群
    tcpdump截帧工具使用
    使用gdb调试应用程序
    工作之用
    primecoin服务常用命令和参数说明
    Windows mysql默认字符集修改
    primecoin在ubuntu16.04上部署服务:
    ubuntu磁盘分配和挂载
  • 原文地址:https://www.cnblogs.com/hexd1230/p/4781526.html
Copyright © 2011-2022 走看看