zoukankan      html  css  js  c++  java
  • 怎么写爬虫,怎么找网站练手抓取链家、中原、安居客、我爱我家,今年5月份开始写论文啦!!!

    //设置请求时间

    // Text of the downloaded page. It stays empty when the server answers with a
    // status other than 200 OK, and is set to null when any exception is raised.
    string html = string.Empty;
    try
    {
        // Direct casts instead of `as`: a url that does not produce an HTTP request
        // now fails with an InvalidCastException at this line rather than a
        // NullReferenceException on the next one. Either way the outer catch handles it.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 30 * 1000; // 30-second timeout
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
        request.ContentType = "text/html; charset=utf-8";

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse()) // send the request
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                log.Error("抓取{0}地址返回失败,响应状态为{1}", url, response.StatusCode);
            }
            else
            {
                try
                {
                    // `using` disposes the reader even when ReadToEnd throws;
                    // the previous explicit Close() only ran on the success path.
                    using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode))
                    {
                        html = reader.ReadToEnd();
                    }
                }
                catch (Exception ex)
                {
                    log.Error("抓取{0}失败", url, ex);
                    html = null;
                }
            }
        }
    }
    catch (Exception ex)
    {
        log.Error("抓取{0}出现异常", url, ex);
        html = null;
    }
    return html;

    //抓取链家、中原、安居客、我爱我家

    // Download the listing page; a null result means the download failed.
    string html = HttpHelper.DownloadUrl(pageurl);
    if (html == null)
    {
        return houseList;
    }
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Lianjia: the <li> elements of the community result list.
    string psht = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
    HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(psht);
    if (noneNodeList == null)
    {
        log.ErrorAsync("数据为空!");
        return houseList;
    }

    foreach (var item in noneNodeList)
    {
        // Each item is loaded into its own document; the XPath queries below
        // run against that document only.
        HtmlDocument docChild = new HtmlDocument();
        docChild.LoadHtml(item.OuterHtml);

        // Lianjia: community name (text of the title link).
        // The node used to be dereferenced without a null check; a missing node
        // now yields the same "null" placeholder as the other two fields.
        string urlPath = @"//*[@class='info']/div[@class='title']/a";
        HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode(urlPath);
        string tsct = urlNode == null ? "null" : urlNode.InnerText;

        // Lianjia: text of the totalPrice span.
        string strs = @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span";
        HtmlNode urlNodes = docChild.DocumentNode.SelectSingleNode(strs);
        string s = urlNodes == null ? "null" : urlNodes.InnerText;

        // Lianjia: text of the totalSellCount span.
        string strst = @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span";
        HtmlNode urlNodest = docChild.DocumentNode.SelectSingleNode(strst);
        string st = urlNodest == null ? "null" : urlNodest.InnerText;

        TrojanHorse house = new TrojanHorse();
        house.title = tsct;
        house.price = s;
        house.remark = st;
        houseList.Add(house);
    }
    }

    /// <summary>
    /// Downloads one listing page and builds a <see cref="TrojanHorse"/> record
    /// (title, price, remark) for every list item matched on it.
    /// </summary>
    /// <param name="pageurl">Address of the listing page to download.</param>
    /// <returns>
    /// The records parsed from the page. The list is empty when the download
    /// returns null or no list item matches. If an exception is raised part-way
    /// through, it is logged and the records collected so far are returned.
    /// </returns>
    private static List<TrojanHorse> GetTrojanHorseList(string pageurl)
    {
        List<TrojanHorse> houseList = new List<TrojanHorse>();
        try
        {
            string html = HttpHelper.DownloadUrl(pageurl);
            if (html == null)
            {
                return houseList;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            // XPath expressions below target Lianjia. Alternatives that were kept
            // in this method for Centaline (中原):
            //   list items: //*[@class='section-wrap section-houselists']/div[@class='section']/div[@class='house-item clearfix']
            //   title:      //*[@class='item-info fl']/h4/a
            //   price:      //*[@class='item-pricearea fr']/p[@class='tc f666 f12 mt_10']/a
            //   remark:     //*[@class='item-pricearea fr']/p[@class='price-nub cRed tc']/span
            string psht = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
            HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(psht);
            if (noneNodeList == null)
            {
                log.ErrorAsync("数据为空!");
                return houseList;
            }

            foreach (var item in noneNodeList)
            {
                // Each item is loaded into its own document; the XPath queries
                // below run against that document only.
                HtmlDocument docChild = new HtmlDocument();
                docChild.LoadHtml(item.OuterHtml);
                HtmlNode root = docChild.DocumentNode;

                TrojanHorse house = new TrojanHorse();
                // Community name. This lookup used to dereference the node without
                // a null check, so one malformed item aborted the whole page.
                house.title = InnerTextOrNullMarker(root, @"//*[@class='info']/div[@class='title']/a");
                house.price = InnerTextOrNullMarker(root, @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span");
                house.remark = InnerTextOrNullMarker(root, @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span");
                houseList.Add(house);
            }
        }
        catch (Exception ex)
        {
            log.ErrorAsync("服务器异常,异常信息:" + ex.Message);
        }

        // The method previously ended after the catch block with no return
        // statement and no closing brace.
        return houseList;
    }

    /// <summary>
    /// Returns the inner text of the first node matching <paramref name="xpath"/>,
    /// or the literal string "null" when nothing matches.
    /// </summary>
    private static string InnerTextOrNullMarker(HtmlNode root, string xpath)
    {
        HtmlNode node = root.SelectSingleNode(xpath);
        return node == null ? "null" : node.InnerText;
    }

    从来没有接触过,零基础学习软件编程,一个字:累
  • 相关阅读:
    LeetCode 189. Rotate Array
    LeetCode 965. Univalued Binary Tree
    LeetCode 111. Minimum Depth of Binary Tree
    LeetCode 104. Maximum Depth of Binary Tree
    Windows下MySQL的安装与配置
    LeetCode 58. Length of Last Word
    LeetCode 41. First Missing Positive
    LeetCode 283. Move Zeroes
    《蚂蚁金服11.11:支付宝和蚂蚁花呗的技术架构及实践》读后感
    删除docker下的镜像
  • 原文地址:https://www.cnblogs.com/hsha/p/8183442.html
Copyright © 2011-2022 走看看