zoukankan      html  css  js  c++  java
  • 正则应用之数据采集房屋网站信息

    import re
    import json
    from urllib.request import urlopen
    import ssl
    # Disable HTTPS certificate verification process-wide by replacing the
    # factory for default HTTPS contexts with the unverified one.
    # NOTE(review): this removes protection against man-in-the-middle attacks;
    # presumably it works around certificate errors on the target site - confirm
    # it is still required.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # URL of the first listing page; main() requests it for page 1.
    ershoufang_url='https://bj.lianjia.com/ershoufang/rs/'
    
    def get_html_content(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Args:
            url: Address to open; passed unchanged to ``urlopen``.

        Returns:
            The complete response body as a ``str``.

        Raises:
            urllib.error.URLError: If the resource cannot be opened.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # The context manager closes the response (and its underlying socket)
        # deterministically, even when read() or decode() raises.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    def chuli(content):
        """Yield one dict per listing matched in the HTML text *content*.

        This is a generator: nothing is compiled or searched until it is
        iterated. Each yielded dict maps the six output labels (price, title,
        area, orientation, decoration, deed info) to the text captured by the
        corresponding named group of the pattern.

        Args:
            content: HTML of a listing page as a ``str``.

        Yields:
            dict: The captured fields of one listing, in label order.
        """
        # The pattern is split into adjacent raw-string pieces purely for
        # readability; concatenated they form a single expression.
        listing_re = re.compile(
            r'<span.*?>关注</span></div>'
            r'<div.*?><span></span></div>'
            r'<div.*?><span></span></div>'
            r'<div class="price"><span>(?P<price>.*?)</span>万</div></a>'
            r'<a.*?>(?P<title>.*?)</a>'
            r'<div class="info">.*?<span>/</span>.*?<span>/</span>'
            r'(?P<pingmi>.*?)<span>/</span>'
            r'(?P<fangxiang>.*?)<span>/</span>'
            r'(?P<zhuangxiu>.*?)</div>'
            r'<div .*?>(?:<span .*?>.*?</span>)?'
            r'<span.*?>(?P<fangben>.*?)</span>',
            re.S,  # let "." span newlines inside the markup
        )
        for match in listing_re.finditer(content):
            fields = match.groupdict()
            # Extra "/" separators left inside the last info field become commas.
            decoration = fields['zhuangxiu'].replace('<span>/</span>', ',')
            # When the captured tag is one of these two placeholder texts rather
            # than deed information, report it as "no information".
            deed = fields['fangben'].replace('随时看房', '无信息')
            deed = deed.replace('关注', '无信息')
            yield {
                '价格:': fields['price'],
                '房屋信息:': fields['title'],
                '平米数:': fields['pingmi'],
                '朝向': fields['fangxiang'],
                '装修:': decoration,
                '房本信息:': deed,
            }
    def xieru(jieguo):
        """Append *jieguo* as one JSON line to the file ``houseInfo``.

        Args:
            jieguo: A JSON-serialisable object (one parsed listing).

        Raises:
            TypeError: If *jieguo* cannot be serialised by ``json.dumps``.
            OSError: If the output file cannot be opened or written.
        """
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
        txt = json.dumps(jieguo, ensure_ascii=False)
        with open('houseInfo', mode='a', encoding='utf-8') as f:
            # One record per line; the terminator must be the escape sequence
            # '\n' - a literal line break inside the quotes is a syntax error.
            f.write(txt + '\n')
    
    def main():
        """Fetch listing pages 1 through 100, saving and printing every record.

        Page 1 is read from ``ershoufang_url``; pages 2-100 use the ``pg<N>``
        URL form. Each record produced by ``chuli`` is passed to ``xieru`` and
        then printed.
        """
        paged_url = 'https://bj.lianjia.com/ershoufang/pg%d/'
        for page_no in range(1, 101):
            # The first page has its own address; later pages are numbered.
            page_url = ershoufang_url if page_no == 1 else paged_url % page_no
            page_html = get_html_content(page_url)
            for record in chuli(page_html):
                xieru(record)
                print(record)
    
    # Start the scraper only when this file is run as a script, not on import.
    if __name__=='__main__':
        main()
  • 相关阅读:
    SQL注入的一般步骤及防范方法
    防止SQL注入的五种方法
    document.getElementById("orderform").submit() 提交给了谁?
    页面调试-F12
    rs.last()续
    rs.last()
    14课后习题
    HashMap
    链表
    习题
  • 原文地址:https://www.cnblogs.com/PythonMrChu/p/9785661.html
Copyright © 2011-2022 走看看