zoukankan      html  css  js  c++  java
  • 正则应用之数据采集房屋网站信息

    import re
    import json
    from urllib.request import urlopen
    import ssl
    # Disable HTTPS certificate verification: every default HTTPS context that
    # urllib creates from here on is the unverified one.
    # NOTE(review): this relies on private ssl attributes and removes
    # protection against man-in-the-middle attacks — confirm it is intended.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # Listing URL requested for page 1; later pages use a ".../pg<N>/" URL
    # built inside main().
    ershoufang_url='https://bj.lianjia.com/ershoufang/rs/'
    
    def get_html_content(url):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        Args:
            url: Address to request; passed unchanged to ``urlopen``.

        Returns:
            The complete response body as a ``str``.

        Raises:
            UnicodeDecodeError: If the body is not valid UTF-8.
            Any exception raised by ``urlopen`` propagates unchanged.
        """
        # The context manager closes the response (and its connection) even
        # when read() or decode() raises; previously it was never closed.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    def chuli(content):
        """Lazily extract listing records from the HTML text *content*.

        A single regular expression (with ``re.S`` so ``.`` also matches
        newlines) is scanned over the page; for every match one dict is
        yielded whose values come from the named groups ``price``, ``title``,
        ``pingmi``, ``fangxiang``, ``zhuangxiu`` and ``fangben``.

        Args:
            content: HTML source of a listing page, as a ``str``.

        Yields:
            dict: One record per regex match, keyed by the Chinese labels
            below.
        """
        listing_re = re.compile(r'<span.*?>关注</span></div><div.*?><span></span></div><div.*?><span></span></div><div class="price"><span>(?P<price>.*?)</span>万</div></a><a.*?>(?P<title>.*?)</a><div class="info">.*?<span>/</span>.*?<span>/</span>(?P<pingmi>.*?)<span>/</span>(?P<fangxiang>.*?)<span>/</span>(?P<zhuangxiu>.*?)</div><div .*?>(?:<span .*?>.*?</span>)?<span.*?>(?P<fangben>.*?)</span>', re.S)
        for match in listing_re.finditer(content):
            fields = match.groupdict()
            # Any separators still embedded in this field become commas.
            decoration = fields['zhuangxiu'].replace('<span>/</span>', ',')
            # These two captured texts are replaced by the '无信息' placeholder.
            deed = fields['fangben'].replace('随时看房', '无信息')
            deed = deed.replace('关注', '无信息')
            yield {
                '价格:': fields['price'],
                '房屋信息:': fields['title'],
                '平米数:': fields['pingmi'],
                '朝向': fields['fangxiang'],
                '装修:': decoration,
                '房本信息:': deed,
            }
    def xieru(jieguo, path='houseInfo'):
        """Append *jieguo* to a text file as one JSON document per line.

        Args:
            jieguo: JSON-serialisable object to store.
            path: File to append to. Defaults to ``'houseInfo'`` in the
                current working directory, which is the file this function
                has always written to.

        Raises:
            TypeError: If *jieguo* cannot be serialised by ``json.dumps``.
            OSError: If the file cannot be opened or written.
        """
        # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX.
        txt = json.dumps(jieguo, ensure_ascii=False)
        with open(path, mode='a', encoding='utf-8') as f:
            # The terminator must be the '\n' escape sequence: a raw line
            # break inside a single-quoted literal is a syntax error.
            f.write(txt + '\n')
    
    def main():
        """Fetch listing pages 1 through 100, storing and printing each record.

        Page 1 is requested from ``ershoufang_url``; every later page from a
        ``.../pg<N>/`` URL. Each record yielded by ``chuli`` is first handed
        to ``xieru`` and then printed.
        """
        for page in range(1, 101):
            if page == 1:
                page_url = ershoufang_url
            else:
                page_url = 'https://bj.lianjia.com/ershoufang/pg%d/' % page
            for record in chuli(get_html_content(page_url)):
                xieru(record)
                print(record)
    
    # Run the scraper only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__=='__main__':
        main()
  • 相关阅读:
    (IOCP)-C#高性能Socket服务器的实现
    GraphQL和RESTful的区别
    HTTP Client Performance Improvements
    foobar2000 iOS使用,并连接PC的歌曲进行播放
    Spring中基于AOP的@AspectJ
    Spring中基于AOP的XML架构
    Spring框架的AOP
    Spring的AOP AspectJ切入点语法详解(转)
    Spring中实现自定义事件
    Spring的事件处理
  • 原文地址:https://www.cnblogs.com/PythonMrChu/p/9785661.html
Copyright © 2011-2022 走看看