zoukankan      html  css  js  c++  java
  • python 爬取链家

    import json
    
    import  requests
    from lxml import etree
    from time import sleep
    
    
    # Entry page of the listing section; it is fetched once, only to read the
    # pagination metadata embedded in the page.
    url = "https://sz.lianjia.com/ershoufang/rs/"
    headers = {
        # NOTE(review): the User-Agent value is blank here; presumably a real
        # browser string is meant to be filled in -- confirm before running.
        "User-Agent":"",
        # "Referer" is the actual HTTP header name; the previous key "Refer"
        # is not a header that servers recognise.
        "Referer":"https://sz.lianjia.com/ershoufang/pg2/"
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()

    # Template for the individual result pages; "{}" receives the page number.
    base_url = "https://sz.lianjia.com/ershoufang/pg{}/"
    html = etree.HTML(resp.text)
    # The pager element carries a JSON string in its "page-data" attribute.
    data = html.xpath('//*[@id="content"]//div[@class="page-box fr"]/div/@page-data')
    if not data:
        # Without this guard the failure surfaces as a bare IndexError below.
        raise RuntimeError("pagination metadata (page-data) not found in " + url)
    data = json.loads(data[0])
    totalPage = data['totalPage']
    curPage = data['curPage']
    
    def get_data(url):
        """Fetch one listing page and return its entries as a list of dicts.

        Each ``<li>`` under ``ul.sellListContent`` becomes one dict with the
        keys ``face``, ``title``, ``position``, ``house_info``,
        ``follow_info``, ``price`` and ``unit_price``. Each value is the first
        XPath match for that field, or ``None`` when the item has no match.
        The key ``tag`` is present only when the item has at least one tag
        span.

        Args:
            url: Address of the listing page to download.

        Returns:
            A list of dicts, one per list item, in document order.
        """
        def first(nodes):
            # Items lacking a sub-element yield an empty match list; indexing
            # it directly would abort the whole crawl with IndexError.
            return nodes[0] if nodes else None

        # Insertion order is the key order of every result dict.
        field_paths = {
            "face": './a/img/@src',
            "title": './/div[@class="title"]/a/text()',
            "position": './/div[@class="positionInfo"]/a/text()',
            "house_info": './/div[@class="houseInfo"]/text()',
            "follow_info": './/div[@class="followInfo"]/text()',
            "price": './/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()',
            "unit_price": './/div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()',
        }

        # Uses the module-level ``headers``; the timeout prevents a stalled
        # connection from blocking the crawl indefinitely.
        resp = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(resp.text)

        listings = []
        for li in html.xpath('.//ul[@class="sellListContent"]/li'):
            content = {key: first(li.xpath(path)) for key, path in field_paths.items()}
            tag = li.xpath('.//div[@class="tag"]//span/text()')
            if tag:
                # Only the first tag is kept; items without tags get no key.
                content['tag'] = tag[0]
            listings.append(content)
        return listings
    
    # Crawl every page reported by the pagination metadata and pool the
    # entries of all pages into one list.
    totalList = []
    for i in range(1,totalPage+1):
        url = base_url.format(i)
        print("crawl url  " + url)
        cur_list = get_data(url)
        print(cur_list)
        # extend() appends in place; rebuilding the list with "+" on every
        # page re-copies everything collected so far.
        totalList.extend(cur_list)

    # Leave ``url`` pointing at the first result page once the crawl is done.
    url = base_url.format(1)

    print(totalList)

  • 相关阅读:
    android 解密工具
    android打包需要的图标
    Mac 创建软链接
    历届试题 Excel地址
    算法训练 字串统计
    最长回文子串
    算法提高 P1001【大数乘法】
    算法提高 拿糖果【埃氏筛 动态规划】
    算法训练 未名湖边的烦恼
    算法提高 合并石子【动态规划】
  • 原文地址:https://www.cnblogs.com/brady-wang/p/12491105.html
Copyright © 2011-2022 走看看