zoukankan      html  css  js  c++  java
  • 对于房天下租房信息进行爬取

    对于房天下租房信息进行爬取

    代码

    import re
    
    import requests
    from lxml.html import etree
    
    # Scrape the first page of Shanghai rental listings from fang.com and append
    # one tuple per listing -- (title, url, rental type, layout, area) -- to
    # info.txt.

    # XPath selectors for each listing's detail link and title attribute.
    url_xpath = '//dd/p[1]/a[1]/@href'
    title_xpath = '//dd/p[1]/a[1]/@title'
    # Not used below; the (misspelled) name is kept so existing references still work.
    data_xpaht = '//dd/p[2]/text()'
    headers = {
        # Was the garbled key 'rpferpr', which meant no Referer header was sent.
        'Referer': 'https://sh.zu.fang.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
    }
    # A timeout keeps the script from hanging forever if the site stops responding.
    rp = requests.get('https://sh.zu.fang.com/', headers=headers, timeout=10)
    # Let requests guess the charset from the body instead of trusting the HTTP header.
    rp.encoding = rp.apparent_encoding
    html = etree.HTML(rp.text)
    url = html.xpath(url_xpath)
    title = html.xpath(title_xpath)
    # Each match is the inner HTML of one listing's summary line.
    data = re.findall('<p class="font15 mt12 bold">(.*?)</p>', rp.text, re.S)
    mold_lis = []
    house_type_lis = []
    area_lis = []
    for summary in data:
        # NOTE(review): '�O' is presumably how the square-metre sign comes out
        # after charset detection -- confirm against a live response.
        summary = re.sub('�O', '平方米', summary)
        # Raw string: the pattern needs a literal \n, \s and \S. The escapes had
        # been lost, leaving an unterminated string literal.
        mold = re.findall(r'\n\s.*?(\S.*?)<span class="splitline">', summary)
        house_type_area = re.findall('</span>(.*?)<span class="splitline">', summary)
        # Always append exactly one value per list, using '' for anything missing.
        # The previous bare `except: pass` could append to some lists but not
        # others (or skip the listing), which shifted every later row when zipped
        # with `title`/`url` (assuming one summary line per listing).
        mold_lis.append(mold[0] if mold else '')
        house_type_lis.append(house_type_area[0] if len(house_type_area) > 0 else '')
        area_lis.append(house_type_area[1] if len(house_type_area) > 1 else '')

    data_zip = zip(title, url, mold_lis, house_type_lis, area_lis)

    # Append mode: repeated runs accumulate rows instead of overwriting the file.
    with open('info.txt', 'a', encoding='utf8') as fa:
        for row in data_zip:
            fa.write(str(row) + '\n')
    

    未完待续

    后续将继续按行政区(分区)进行爬取,各区对应的 URL 路径片段如下:

    # Maps Shanghai district names (plus '不限' = no filter and '上海周边' =
    # around Shanghai) to a 'house-…' path segment.
    # Presumably the segment is appended to https://sh.zu.fang.com/ to list
    # rentals for that district -- TODO confirm against the site.
    # NOTE(review): the name looks like a typo for `area_dict`; left unchanged.
    # NOTE(review): every entry from '黄浦' through '长宁' has the same value
    # 'house-a024'. These look like copy-pasted placeholders; verify each
    # district's real code before relying on this table.
    arpa_dict = {
        '不限':'house',
        '浦东':'house-a025',
        '嘉定':'house-a029',
        '宝山':'house-a030',
        '闵行':'house-a018',
        '松江':'house-a0586',
        '普陀':'house-a028',
        '静安':'house-a021',
        '黄浦':'house-a024',
        '虹口':'house-a024',
        '青浦':'house-a024',
        '奉贤':'house-a024',
        '金山':'house-a024',
        '杨浦':'house-a024',
        '徐汇':'house-a024',
        '长宁':'house-a024',
        '崇明':'house-a0996',
        '上海周边':'house-a01046',
    }
    
  • 相关阅读:
    【转载】常考算法模板
    NOIP2020微信步数暴力80分
    NOIP2020移球游戏快速排序满分程序
    第一场NOI Online能力测试入门组B跑步
    【转】STL之Set——插入元素、二分查找元素(log级别)
    [转载]图论500题
    差分约束系统简单介绍(入门)
    辗转相除法的证明
    并查集2个优化——按秩合并和路径压缩
    递推算法之平面分割问题总结
  • 原文地址:https://www.cnblogs.com/pythonywy/p/11259941.html
Copyright © 2011-2022 走看看