zoukankan      html  css  js  c++  java
  • 爬取豆瓣 Top250书籍

    '''爬取豆瓣top250书籍'''
    import requests
    import json
    import csv
    from bs4 import BeautifulSoup
    
    # Module-level accumulator: book_name() appends one list of scraped fields
    # per book here, and the script later writes each entry as a CSV row.
    books = []
    
    
    def book_name(url):
        """Scrape one page of the Douban Top 250 book list.

        Every book found on the page is appended to the module-level ``books``
        list as ``[title, rating, brief, link]`` and also printed.

        Args:
            url: Address of the list page to fetch.

        Returns:
            The URL of the next page, or ``0`` when the current page is the
            last one (the caller loops until it receives ``0``).

        Raises:
            requests.RequestException: If the request fails, times out, or the
                server answers with an HTTP error status.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
        }
        # A timeout keeps the scraper from hanging forever on a stalled
        # connection; raise_for_status surfaces blocked/failed requests
        # instead of parsing an error page.
        res = requests.get(url=url, headers=headers, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        items = soup.find('div', attrs={'class': 'grid-16-8 clearfix'}).find(
            'div', attrs={'class': 'indent'}).find_all('table')

        for item in items:
            title = item.find('div', attrs={'class': 'pl2'}).find('a')
            star = item.find(class_="star clearfix").find(class_="rating_nums")

            # Not every book has a one-line quote; fall back to a placeholder
            # so each row keeps the same four columns.
            quote = item.find(class_="quote")
            inq = quote.find(class_="inq") if quote is not None else None
            brief = inq.text if inq is not None else "~暂无简介~!"

            # The link is recorded for every book, with or without a brief.
            link = item.find(class_="pl2").find('a')['href']

            book = [
                title.text.replace('\n', '').strip(),
                star.text,
                brief,
                link,
            ]
            books.append(book)

            print(book)

        try:
            next_url = soup.find(class_="paginator").find(
                class_="next").find('a')['href']
        # On the last page the "next" element has no <a>, so indexing None
        # raises TypeError.
        except TypeError:
            return 0
        return next_url
    
    # Start from the first list page and keep following the "next page" link
    # until book_name() signals the end by returning 0.
    next_url = 'https://book.douban.com/top250?start=0&filter='
    count = 0

    while next_url != 0:
        count += 1
        next_url = book_name(next_url)
        print('-----------以上是第'+str(count)+'页的内容-----------')

    # The context manager guarantees the file is flushed and closed even if a
    # write fails; newline='' is what the csv module expects for output files.
    with open('top250_books.csv', 'w', newline='', encoding='utf-8') as csv_file:
        w = csv.writer(csv_file)
        w.writerow(['书名','评分','简介','链接'])
        w.writerows(books)

     

  • 相关阅读:
    uva 12426 Counting Triangles 计算几何
    poj 1195 Mobile phones 二维树状数组
    poj 1039 Pipe 计算几何
    poj 3580 SuperMemo 数据结构
    poj 1031 Fence 计算几何
    ArcEngine 无法嵌入互操作类型
    IDL 读取显示HDF文件
    Sql Server 2005 Com+ 警告处理办法
    C# 自定义控件开发
    ArcEngine 获取HDF文件中的子文件
  • 原文地址:https://www.cnblogs.com/memory-ccy/p/13225055.html
Copyright © 2011-2022 走看看