zoukankan      html  css  js  c++  java
  • 正则应用之数据采集房屋网站信息

    import re
    import json
    from urllib.request import urlopen
    import ssl
    # Disable HTTPS certificate verification for every default HTTPS context
    # created in this process (overrides the ssl module's private factory).
    # NOTE(review): this removes protection against man-in-the-middle attacks;
    # acceptable only for a throwaway scraping script.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # URL requested for the first page of listings (see main()).
    ershoufang_url='https://bj.lianjia.com/ershoufang/rs/'
    
    def get_html_content(url):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        Args:
            url: Address to open with ``urllib.request.urlopen``.

        Returns:
            The full response body as a ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # Use the response as a context manager so the underlying connection
        # is closed even when read() or decode() raises; this function is
        # called once per page, so leaked responses would pile up.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    def chuli(content):
        """Lazily extract listing records from a page of HTML.

        The pattern is matched with ``re.S`` so ``.`` also spans newlines.
        Each match contributes one dict built from the named groups
        ``price``, ``title``, ``pingmi``, ``fangxiang``, ``zhuangxiu`` and
        ``fangben``.

        Args:
            content: HTML text to scan.

        Yields:
            A dict with six Chinese-labelled keys, one dict per regex match,
            in the order the matches occur in *content*.
        """
        listing_re = re.compile(
            r'<span.*?>关注</span></div><div.*?><span></span></div><div.*?><span></span></div><div class="price"><span>(?P<price>.*?)</span>万</div></a><a.*?>(?P<title>.*?)</a><div class="info">.*?<span>/</span>.*?<span>/</span>(?P<pingmi>.*?)<span>/</span>(?P<fangxiang>.*?)<span>/</span>(?P<zhuangxiu>.*?)</div><div .*?>(?:<span .*?>.*?</span>)?<span.*?>(?P<fangben>.*?)</span>',
            re.S,
        )
        for match in listing_re.finditer(content):
            fields = match.groupdict()

            # Any separators left inside the last "info" field become commas.
            decoration = fields['zhuangxiu'].replace('<span>/</span>', ',')

            # These two captured texts are replaced with the "no information"
            # marker, in the same order as the original chained replace calls.
            ownership = fields['fangben']
            for placeholder in ('随时看房', '关注'):
                ownership = ownership.replace(placeholder, '无信息')

            yield {
                '价格:': fields['price'],
                '房屋信息:': fields['title'],
                '平米数:': fields['pingmi'],
                '朝向': fields['fangxiang'],
                '装修:': decoration,
                '房本信息:': ownership,
            }
    def xieru(jieguo):
        """Append *jieguo* to the ``houseInfo`` file as one line of JSON.

        Args:
            jieguo: A JSON-serialisable object (``main`` passes the dicts
                produced by ``chuli``).

        Raises:
            TypeError: If *jieguo* cannot be serialised by ``json.dumps``.
            OSError: If the file cannot be opened or written.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the output file
        # instead of emitting \uXXXX escapes.
        txt = json.dumps(jieguo, ensure_ascii=False)
        with open('houseInfo', mode='a', encoding='utf-8') as f:
            # One record per line; the newline must be the '\n' escape, since a
            # raw line break inside the quotes is an unterminated string.
            f.write(txt + '\n')
    
    def main():
        """Fetch pages 1 through 100, then save and print every parsed record.

        For each page the HTML is downloaded with ``get_html_content``, parsed
        with ``chuli``, and every resulting record is appended to disk via
        ``xieru`` and echoed to standard output.
        """
        for page in range(1, 101):
            # Page 1 is requested from the base URL; every later page uses
            # the numbered "pg<N>" form.
            if page == 1:
                page_url = ershoufang_url
            else:
                page_url = 'https://bj.lianjia.com/ershoufang/pg%d/' % page

            for record in chuli(get_html_content(page_url)):
                xieru(record)
                print(record)
    
    # Start scraping only when this file is run as a script, not when imported.
    if __name__=='__main__':
        main()
  • 相关阅读:
    [zoj3627]模拟吧
    [zoj3623]背包模型
    [hdu4358]树状数组
    [hdu1272]并查集
    [hdu3308]线段树
    [hdu5033]单调队列
    [hdu1506]单调队列(栈)
    [hdu2888]二维RMQ
    [hdu4123]dfs区间化+RMQ
    [hdu1242]优先队列
  • 原文地址:https://www.cnblogs.com/PythonMrChu/p/9785661.html
Copyright © 2011-2022 走看看