zoukankan      html  css  js  c++  java
  • 正则表达式实例:取得普陀区所有的小区名字和地址

    程序就是个好东西,人很难完成的任务,它只需很短时间就搞定。

    下面我们来采集一个房产网站上的所有普陀区的小区列表

    该地址为:http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA

    {0}为页码,共35页,C#实现代码如下:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;
    
    namespace Hourse
    {
        /// <summary>
        /// Console scraper: downloads every listing page for one district, extracts each
        /// community's name and address with a regular expression, and appends one
        /// "name address" line per community to data.txt next to the executable.
        /// </summary>
        class Program
        {
            // Matches one listing entry. The markup it expects looks like:
            //   <div class="fsize14 margin-bottom8"> <strong> <a href="..." target="_blank"> NAME</a> </strong> </div>
            //   <div class="margin-bottom5">DISTRICT ADDRESS,</div>
            // Group 1 is the community name, group 2 is the raw address text.
            // Built once instead of being re-parsed for every page.
            private static readonly Regex ListingPattern = new Regex(
                "<div class=\"fsize14 margin-bottom8\">\\s+<strong>\\s+<a\\s+[^>]+>\\s+(.+?)</a>\\s+</strong>" +
                "\\s+</div>\\s+<div class=\"margin-bottom5\">([^<]+)</div>");

            // Whitespace, commas and parentheses that are stripped from the raw address text.
            private static readonly Regex AddressNoise = new Regex("[\\s,( )]+");

            // URL template; "{0}" is replaced with the page number.
            private static string uri;

            // Full path of the output file.
            private static string file;

            /// <summary>
            /// Entry point: walks all listing pages, reports progress on the console and
            /// waits for a key press before exiting.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                uri = "http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA";
                file = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data.txt");

                // File.Create returns an open FileStream. It must be disposed right away,
                // otherwise the handle stays locked and the first Save() call fails with
                // an IOException because the file is still in use.
                if (!File.Exists(file))
                {
                    File.Create(file).Dispose();
                }

                Console.WriteLine("--------------------------");
                Console.WriteLine("开始采集数据,请等待...");
                Console.WriteLine("--------------------------");

                int pages = 35;
                int counts = 0;
                for (int i = 1; i <= pages; i++)
                {
                    counts += OperateInfo(i);
                }

                Console.WriteLine("采集完成!共" + counts + "条,文件存放在" + file);
                Console.ReadKey();
            }

            /// <summary>
            /// Downloads one listing page, saves every community found on it and prints
            /// the per-page count.
            /// </summary>
            /// <param name="page">Page number substituted into the URL template.</param>
            /// <returns>The number of communities matched on the page.</returns>
            static int OperateInfo(int page)
            {
                string _uri = uri.Replace("{0}", page.ToString());

                // WebClient is IDisposable; release it as soon as the page is downloaded.
                string txt;
                using (WebClient client = new WebClient())
                {
                    byte[] datas = client.DownloadData(_uri);
                    txt = Encoding.UTF8.GetString(datas);
                }

                MatchCollection mc = ListingPattern.Matches(txt);
                foreach (Match m in mc)
                {
                    // Read the capture groups directly rather than re-running the whole
                    // pattern over the matched text through Regex.Replace.
                    string name = m.Groups[1].Value;
                    string address = AddressNoise.Replace(m.Groups[2].Value, "");
                    Save(name + " " + address);
                }

                Console.WriteLine("第" + page + "页采集到" + mc.Count + "条!");
                return mc.Count;
            }

            /// <summary>
            /// Appends a single line to the output file using UTF-8.
            /// </summary>
            /// <param name="str">The line to append.</param>
            static void Save(string str)
            {
                // append: true keeps earlier lines; disposing the writer flushes and closes it.
                using (StreamWriter sw = new StreamWriter(file, true, Encoding.UTF8))
                {
                    sw.WriteLine(str);
                }
            }
        }
    }
    

    运行程序:

    DEMO下载

    原创内容请您保留出处及地址 , 主页:展益
  • 相关阅读:
    关于Snoop的用法
    WPF中Xaml编译正常而Designer Time时出错的解决办法
    TFS自定义开发中的反射应用
    用Fiddler2来监听HTTP(记:用skydrive sdk访问时,出错后用Fidder抓包分析)
    Send Code to evernote by my specify notebook
    Sublime Text2中Evernote 插件的使用
    编译Python出现Tab,空格的问题
    Python 中list, dictionary 与 file相互操作
    import module, from module import funtion区别
    C# 性能总结
  • 原文地址:https://www.cnblogs.com/newmin/p/1992015.html
Copyright © 2011-2022 走看看