zoukankan      html  css  js  c++  java
  • 怎么写爬虫,怎么找网站练手抓取链家、中原、安居客、我爱我家,今年5月份开始写论文啦!!!

    // Downloads the page at `url` and returns its body as text.
    // Outcomes visible in this fragment:
    //   - response status is 200 OK : the decoded body
    //   - any other response status : string.Empty (the failure is logged)
    //   - any exception             : null (the exception is logged)
    // `url`, `encode` and `log` are declared outside this fragment; `encode` is passed
    // as the second StreamReader argument, so it is presumably a System.Text.Encoding — TODO confirm.
    string html = string.Empty;
    try
    {
        HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest; // build the simulated browser request
        request.Timeout = 30 * 1000; // 30 second timeout
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
        request.ContentType = "text/html; charset=utf-8";
        using (HttpWebResponse response = request.GetResponse() as HttpWebResponse) // send the request
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                log.Error("抓取{0}地址返回失败,响应状态为{1}", url, response.StatusCode);
            }
            else
            {
                try
                {
                    // `using` guarantees the reader is disposed even when ReadToEnd throws;
                    // the previous manual Close() call was skipped on that path.
                    using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode))
                    {
                        html = reader.ReadToEnd(); // read the whole body
                    }
                }
                catch (Exception ex)
                {
                    log.Error("抓取{0}失败", url, ex);
                    html = null;
                }
            }
        }
    }
    catch (Exception ex)
    {
        log.Error("抓取{0}出现异常", url, ex);
        html = null;
    }
    return html;

    // Scrape Lianjia, Centaline, Anjuke and 5i5j listings.
    // This fragment downloads `pageurl`, selects the Lianjia list items and appends one
    // TrojanHorse (title / price / remark) per item to `houseList`.
    // `pageurl`, `houseList` and `log` are declared outside this fragment.

    string html = HttpHelper.DownloadUrl(pageurl);
    if (html == null)
    {
    return houseList;
    }
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);
    // Lianjia: XPath selecting every list item on the page
    string psht = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
    HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(psht);
    if (noneNodeList == null)
    {
    log.ErrorAsync("数据为空!");
    return houseList;
    }
    foreach (var item in noneNodeList)
    {
    TrojanHorse house = new TrojanHorse();
    // Each item is re-parsed as its own document, so the "//*" paths below only search inside this item.
    HtmlDocument docChild = new HtmlDocument();
    docChild.LoadHtml(item.OuterHtml);
    // Lianjia: title link
    string urlPath = @"//*[@class='info']/div[@class='title']/a";
    HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode(urlPath);
    // NOTE(review): urlNode is dereferenced without a null check, unlike the two lookups below — confirm the title link is always present.
    string tsct = urlNode.InnerText;// residential community name
    // Lianjia [@class='xiaoquListItemRight']/div
    string strs = @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span";
    HtmlNode urlNodes = docChild.DocumentNode.SelectSingleNode(strs);
    // Price text, or the literal string "null" when the node is missing.
    string s = "";
    if (urlNodes == null)
    {
    s = "null";
    }
    else
    {
    s = urlNodes.InnerText;
    }
    // Lianjia: sell-count span
    string strst = @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span";
    HtmlNode urlNodest = docChild.DocumentNode.SelectSingleNode(strst);
    // Sell-count text, or the literal string "null" when the node is missing.
    string st = "";
    if (urlNodest == null)
    {
    st = "null";
    }
    else
    {
    st = urlNodest.InnerText;
    }
    #region
    //string tscts = s.Replace(" ", "");
    //string tsctst = tscts.Substring(0, 8);
    //string tsctsb = tscts.Substring(tscts.Length - 7, 7);
    // string rsf = s;
    //string zf = tsctsb.Substring(0, 5);// rental
    #endregion
    house.title = tsct;
    house.price = s;
    house.remark = st;
    houseList.Add(house);
    #region
    //house.StaffName = urlNode.Attributes["title"].Value; // agent name
    //string companyPath = "//*[@class='jjr-info']/p[@class='jjr-desc mg-top']/a[position()<2]";
    //HtmlNode companyNode = docChild.DocumentNode.SelectSingleNode(companyPath);
    //if (companyNode == null)
    //{
    // continue;
    //}
    //house.Company = companyNode.InnerText; // agency company
    //string telPath = "//*[@class='jjr-side']";
    //HtmlNode telNode = docChild.DocumentNode.SelectSingleNode(telPath);
    //if (telNode == null)
    //{
    // continue;
    //}
    //string telstr = telNode.InnerText.Trim();
    //house.Mobile = telstr; // agent phone number
    //house.CityCode = citycode; // city code
    //house.CreateTime = DateTime.Now;
    //var flag = houseList.Where(x => x.Mobile == telstr).FirstOrDefault(); // do not add entries with a duplicate phone number
    //if (flag == null)
    //{
    // houseList.Add(house);
    //}
    #endregion
    }
    // The brace below closes a scope that is opened outside this fragment.
    }

    /// <summary>
    /// Scrapes one listing page and returns the entries found on it.
    /// </summary>
    /// <param name="pageurl">URL of the listing page to download and parse.</param>
    /// <returns>
    /// One <c>TrojanHorse</c> per matched list item. The list is empty when the download
    /// returns null or when no list item matches; if an exception is caught, the entries
    /// collected up to that point are returned.
    /// </returns>
    private static List<TrojanHorse> GetTrojanHorseList(string pageurl)
    {
        List<TrojanHorse> houseList = new List<TrojanHorse>();
        try
        {
            string html = HttpHelper.DownloadUrl(pageurl);
            if (html == null)
            {
                return houseList;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            // Lianjia selectors.
            // The removed commented-out variant targeted Centaline with these selectors instead:
            //   items  : //*[@class='section-wrap section-houselists']/div[@class='section']/div[@class='house-item clearfix']
            //   title  : //*[@class='item-info fl']/h4/a
            //   price  : //*[@class='item-pricearea fr']/p[@class='tc f666 f12 mt_10']/a
            //   remark : //*[@class='item-pricearea fr']/p[@class='price-nub cRed tc']/span
            string itemPath = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
            string titlePath = @"//*[@class='info']/div[@class='title']/a";
            string pricePath = @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span";
            string remarkPath = @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span";

            HtmlNodeCollection itemNodes = doc.DocumentNode.SelectNodes(itemPath);
            if (itemNodes == null)
            {
                log.ErrorAsync("数据为空!");
                return houseList;
            }

            foreach (var item in itemNodes)
            {
                // Re-parse the item as its own document so the "//*" paths only search inside this item.
                HtmlDocument docChild = new HtmlDocument();
                docChild.LoadHtml(item.OuterHtml);

                TrojanHorse house = new TrojanHorse();
                // The title previously dereferenced the node unchecked, so one item without a
                // title link threw and aborted the rest of the page. It now falls back to the
                // same "null" placeholder the price and remark fields already used.
                house.title = GetInnerTextOrNullMarker(docChild, titlePath);   // residential community name
                house.price = GetInnerTextOrNullMarker(docChild, pricePath);
                house.remark = GetInnerTextOrNullMarker(docChild, remarkPath);
                houseList.Add(house);
            }
        }
        catch (Exception ex)
        {
            log.ErrorAsync("服务器异常,异常信息:" + ex.Message);
        }

        return houseList;
    }

    /// <summary>
    /// Returns the inner text of the first node matching <paramref name="xpath"/>,
    /// or the literal string "null" when nothing matches.
    /// </summary>
    /// <param name="doc">Parsed document to search.</param>
    /// <param name="xpath">XPath expression evaluated from the document node.</param>
    /// <returns>The matched node's inner text, or "null".</returns>
    private static string GetInnerTextOrNullMarker(HtmlDocument doc, string xpath)
    {
        HtmlNode node = doc.DocumentNode.SelectSingleNode(xpath);
        return node == null ? "null" : node.InnerText;
    }

    从来没有接触过,零基础学习软件编程,一个字:累
  • 相关阅读:
    struts.xml配置详解 内部资料 请勿转载 谢谢合作
    Struts框架2ActionError类 内部资料 请勿转载 谢谢合作
    Struts框架 内部资料 请勿转载 谢谢合作
    JDBC 内部资料 请勿转载 谢谢合作
    JSP 实现◆菱形 三角形△ 的输出
    JSP实现 乘法口诀输出
    GUID 全局统一标识符的介绍
    Java 基础 Map 练习题
    Java 异常处理 练习2
    Java 异常处理
  • 原文地址:https://www.cnblogs.com/hsha/p/8183442.html
Copyright © 2011-2022 走看看