zoukankan      html  css  js  c++  java
  • 照葫芦画瓢之爬虫豆瓣top100

    import requests
    import re
    import json
    from requests.exceptions import RequestException


    def get(url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block indefinitely on a stalled
                connection.

        Returns:
            The response text when the server answers with HTTP 200, otherwise
            ``None`` (non-200 status, or any ``RequestException`` such as a
            connection error or timeout).
        """
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }
        # Only the network call is guarded; the status check cannot raise.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None
    def parse(html):
        """Yield one dict per book ``<li>`` entry found in ``html``.

        Args:
            html: Page markup as a string. ``None`` or an empty string (for
                example when the download failed) yields nothing.

        Yields:
            Dicts with the keys ``'url'``, ``'title'``, ``'name'`` (author),
            ``'date'`` (year) and ``'pulisher'``. The last three values are
            stripped of surrounding whitespace. The spelling ``'pulisher'`` is
            kept as-is so existing consumers of the output keep working.
        """
        if not html:
            return
        # ``\s+`` between the href and title attributes: the previous literal
        # ``"stitle=`` could never match markup of the form
        # ``href="..." title="..."``. re.S lets ``.`` span line breaks.
        pattern = re.compile(
            r'<li.*?cover.*?href="(.*?)"\s+title="(.*?)">'
            r'.*?more-meta.*?author">(.*?)</span>'
            r'.*?year">(.*?)</span>'
            r'.*?publisher">(.*?)</span>.*?</li>',
            re.S,
        )
        for url, title, author, year, publisher in pattern.findall(html):
            yield {
                'url': url,
                'title': title,
                'name': author.strip(),
                'date': year.strip(),
                'pulisher': publisher.strip()
            }
    def write_to_file(content):
        """Append ``content`` as JSON to ``result.txt`` in the working directory.

        Args:
            content: A JSON-serialisable object (``main`` passes the dicts
                produced by ``parse``).

        Each record is followed by a single space. Non-ASCII characters are
        written as-is (UTF-8) rather than as ``\\uXXXX`` escapes.
        """
        # The ``with`` block closes the file; no explicit close() is needed.
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + ' ')
    def main():
        """Download https://book.douban.com/, then print and save every entry.

        Each dict yielded by ``parse`` is printed and passed to
        ``write_to_file``. If the download fails (``get`` returns ``None``),
        a message is printed and nothing is written.
        """
        url = 'https://book.douban.com/'
        html = get(url)
        if html is None:
            # get() signals a request error or non-200 status with None;
            # passing that to the regex parser would raise TypeError.
            print('Failed to fetch ' + url)
            return
        for item in parse(html):
            print(item)
            write_to_file(item)


    # Run the scraper only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()

  • 相关阅读:
    ADO.NET的记忆碎片(四)
    ADO.NET的记忆碎片(八)
    卡特兰数 应用
    hdu 1249 三角形
    hdu 1143
    nyist 93 汉诺塔(三)
    hdu 1123 Train Problem II
    hdu 1133 Buy the Ticket
    hdu 1022 Train Problem I
    nyist 610 定长覆盖
  • 原文地址:https://www.cnblogs.com/MisterZZL/p/9534307.html
Copyright © 2011-2022 走看看