zoukankan      html  css  js  c++  java
  • 正则匹配的爬虫

    import requests
    import re
    class Anjuke(object):
        """Regex-based scraper for a single Anjuke listing page.

        Workflow (see ``run``): download the page at ``self.url``, extract the
        inner HTML of the ``houselist-mod-new`` ``<ul>`` element, strip markup
        from it and append the remaining text to ``anjuke.text``.
        """

        def __init__(self):
            """Set the target URL, request headers and the two regex patterns."""
            self.url = "https://beijing.anjuke.com/sale/huairou/o5/"
            # A browser-like User-Agent is sent with the request.
            self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"}
            # Captures everything between the listing <ul ...> and the first
            # following </ul>; re.S lets "." match newlines so the capture can
            # span multiple lines.
            self.pattern = re.compile(r'<ul id="houselist-mod-new" class="houselist-mod houselist-mod-new">(.*?)</ul>', re.S)
            # Matches what must be removed from the captured HTML: tags,
            # character entities (e.g. "&nbsp;") and whitespace. The last
            # alternative must be "\s"; a bare "s" would delete every literal
            # letter "s" and leave the whitespace in place.
            self.second_pattern = re.compile(r'<(.*?)>|&(.*?);|\s')

        def send_request(self):
            """Download ``self.url`` and return the body decoded as UTF-8.

            Returns:
                str: the decoded response body.

            Raises:
                requests.RequestException: on connection failure, timeout or a
                    4xx/5xx HTTP status.
                UnicodeDecodeError: if the body is not valid UTF-8.
            """
            # A timeout keeps the script from hanging forever on a stalled
            # connection.
            response = requests.get(self.url, headers=self.headers, timeout=10)
            # Fail loudly on an HTTP error instead of parsing an error page.
            response.raise_for_status()
            return response.content.decode('utf-8')

        def save_data(self, result_data):
            """Strip markup from each fragment and append it to ``anjuke.text``.

            Args:
                result_data: iterable of HTML fragments (strings), e.g. the
                    list returned by ``analysis_data``.

            Each cleaned fragment is written followed by a single space.
            """
            # Explicit UTF-8 so non-ASCII text is written the same way on every
            # platform instead of depending on the locale's default encoding.
            with open('anjuke.text', 'a', encoding='utf-8') as f:
                f.writelines(
                    self.second_pattern.sub('', fragment) + ' '
                    for fragment in result_data
                )

        def analysis_data(self, data):
            """Return the inner HTML of every listing ``<ul>`` found in ``data``.

            Args:
                data: page HTML as a string.

            Returns:
                list[str]: one entry per match of ``self.pattern``; empty when
                the page contains no such element.
            """
            return self.pattern.findall(data)

        def run(self):
            """Fetch the page, extract the listing block(s), print and save them."""
            data = self.send_request()
            result_list = self.analysis_data(data)
            print(result_list)
            self.save_data(result_list)

    # Script entry point: build the scraper and run one fetch/parse/save pass.
    if __name__ == '__main__':
        scraper = Anjuke()
        scraper.run()

























  • 相关阅读:
    自己开发网站全文检索系统
    中国摇滚二十年(经典100首歌曲)
    有一首歌
    Snoopy.class.php使用手册
    wp-Syntax 插件使用方法
    rabbitmq使用
    小程序相关功能的实现
    知识链接
    celery使用
    阿里云服务器部署项目注意事项
  • 原文地址:https://www.cnblogs.com/hanjian200ok/p/9463165.html
Copyright © 2011-2022 走看看