zoukankan      html  css  js  c++  java
  • 照葫芦画瓢之爬虫豆瓣top100

    import requests
    import re
    import json
    from requests.exceptions import RequestException


    def get(url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        A desktop-browser User-Agent header is sent with the request.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout, ``requests`` waits indefinitely on a stalled
                connection.

        Returns:
            The response text when the status code is 200; ``None`` for any
            other status code or when a ``RequestException`` (including a
            timeout) is raised.
        """
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }
        # Keep the try body down to the one call that can raise.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None
    def parse(html):
        """Yield one dict per book entry found in ``html``.

        Each ``<li>`` that contains a cover link followed by ``more-meta``
        author / year / publisher spans produces a dict with the keys
        ``'url'``, ``'title'``, ``'name'``, ``'date'`` and ``'pulisher'``.
        The last key is misspelled in the original output format and is kept
        as-is so existing consumers of the data keep working.

        Args:
            html: Page markup as a string. ``None`` or an empty string (for
                example when the download failed) yields nothing.

        Yields:
            dict: The extracted fields; author, year and publisher have
            surrounding whitespace stripped.
        """
        if not html:
            return
        # The link's attributes are separated by whitespace
        # (href="..." title="..."), so match it with \s+; a literal "s" there
        # never matches real markup. re.S lets ".*?" span line breaks.
        pattern = re.compile(
            r'<li.*?cover.*?href="(.*?)"\s+title="(.*?)">'
            r'.*?more-meta.*?author">(.*?)</span>'
            r'.*?year">(.*?)</span>'
            r'.*?publisher">(.*?)</span>.*?</li>',
            re.S,
        )
        for match in pattern.finditer(html):
            url, title, author, year, publisher = match.groups()
            yield {
                'url': url,
                'title': title,
                'name': author.strip(),
                'date': year.strip(),
                'pulisher': publisher.strip(),
            }
    def write_to_file(content):
        """Append ``content`` to ``result.txt`` as one JSON record per line.

        Args:
            content: A JSON-serializable object (here, one parsed book dict).

        The file is opened in append mode with UTF-8 encoding, and
        ``ensure_ascii=False`` keeps non-ASCII text readable instead of
        ``\\uXXXX`` escapes.
        """
        with open('result.txt', 'a', encoding='utf-8') as f:
            # Newline-terminate each record so the file can be read back line
            # by line; the with-block closes the file, no explicit close().
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    def main():
        """Download the Douban Books front page, then print and save each parsed entry."""
        url = 'https://book.douban.com/'
        html = get(url)
        if html is None:
            # get() returns None on a request error or a non-200 status;
            # there is nothing to parse in that case.
            print('Failed to fetch ' + url)
            return
        for item in parse(html):
            print(item)
            write_to_file(item)


    # Run the scraper only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()

  • 相关阅读:
    css样式详解
    数组去重的两种方式
    react 中class样式的书写过程及注意点
    react 行内样式几种写法
    React创建组件的两种方法
    React中class创建组件和function创建组件的区别
    class关键字
    关于webpack配置环境,以及自己搭建react环境
    经典圣杯双飞翼布局
    关于vue中深拷贝的惨痛教训
  • 原文地址:https://www.cnblogs.com/MisterZZL/p/9534307.html
Copyright © 2011-2022 走看看