zoukankan      html  css  js  c++  java
  • 正则应用之数据采集房屋网站信息

    import re
    import json
    from urllib.request import urlopen
    import ssl
    # Disable HTTPS certificate verification: the default HTTPS context factory
    # is replaced with ssl._create_unverified_context.
    # NOTE(review): this override applies to the whole process — confirm that
    # skipping certificate checks is acceptable here.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # URL that main() fetches for the first page of listings.
    ershoufang_url='https://bj.lianjia.com/ershoufang/rs/'
    
    def get_html_content(url):
        """Fetch ``url`` and return the response body decoded as UTF-8 text.

        Args:
            url: Address passed to ``urllib.request.urlopen``.

        Returns:
            The complete response body as a ``str``.

        Raises:
            urllib.error.URLError: If the resource cannot be opened.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # The context manager closes the response even if read/decode raises;
        # previously the response object was never closed explicitly.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    def chuli(content):
        """Yield one dict of listing fields for every listing matched in ``content``.

        The pattern captures price, title, area, orientation, decoration and a
        trailing tag from the listing markup. In the decoration field any
        remaining ``<span>/</span>`` separators become commas; in the tag field
        the placeholder texts '随时看房' and '关注' are replaced by '无信息'.

        Args:
            content: HTML text of a listing page.

        Yields:
            dict: Captured fields keyed by their Chinese labels.
        """
        pattern = re.compile(
            r'<span.*?>关注</span></div><div.*?><span></span></div><div.*?><span></span></div><div class="price"><span>(?P<price>.*?)</span>万</div></a><a.*?>(?P<title>.*?)</a><div class="info">.*?<span>/</span>.*?<span>/</span>(?P<pingmi>.*?)<span>/</span>(?P<fangxiang>.*?)<span>/</span>(?P<zhuangxiu>.*?)</div><div .*?>(?:<span .*?>.*?</span>)?<span.*?>(?P<fangben>.*?)</span>',
            re.S,
        )
        for match in pattern.finditer(content):
            fields = match.groupdict()
            decoration = fields['zhuangxiu'].replace('<span>/</span>', ',')
            deed = fields['fangben'].replace('随时看房', '无信息').replace('关注', '无信息')
            yield {
                '价格:': fields['price'],
                '房屋信息:': fields['title'],
                '平米数:': fields['pingmi'],
                '朝向': fields['fangxiang'],
                '装修:': decoration,
                '房本信息:': deed,
            }
    def xieru(jieguo):
        """Append ``jieguo`` to the ``houseInfo`` file as a single JSON line.

        Args:
            jieguo: JSON-serialisable object (main() passes the dicts yielded
                by chuli()).

        The file is opened in append mode with UTF-8 encoding, and non-ASCII
        characters are written as-is rather than as ``\\uXXXX`` escapes.
        """
        txt = json.dumps(jieguo, ensure_ascii=False)
        with open('houseInfo', mode='a', encoding='utf-8') as f:
            # Terminate each record with an escaped newline; a raw line break
            # inside the quotes is a syntax error.
            f.write(txt + '\n')
    
    def main():
        """Fetch listing pages 1 through 100, then save and print each record."""
        for page in range(1, 101):
            # Page 1 is served from the base listing URL; later pages use /pgN/.
            if page == 1:
                page_url = ershoufang_url
            else:
                page_url = 'https://bj.lianjia.com/ershoufang/pg%d/' % page
            for record in chuli(get_html_content(page_url)):
                xieru(record)
                print(record)
    
    # Start the scrape only when this file is executed as a script.
    if __name__=='__main__':
        main()
  • 相关阅读:
    sqlserver2005新特性介绍
    Sql 数据库 用户密码MD5加密
    easyui datagrid
    JQ js 对数组的操作
    c#2.0锐利体验《泛型编程》读书笔记
    jQuery EasyUI DataGrid Checkbox 数据设定与取值
    数据库的常用命令
    关于background-image设置背景图片
    Css背景设置
    实时监听输入框值变化的完美方案:oninput & onpropertychange
  • 原文地址:https://www.cnblogs.com/PythonMrChu/p/9785661.html
Copyright © 2011-2022 走看看