zoukankan      html  css  js  c++  java
  • python3+requests+BeautifulSoup+mysql爬取豆瓣电影top250

      基础页面:https://movie.douban.com/top250

      代码:

    from time import sleep
    from requests import get
    from bs4 import BeautifulSoup
    import re
    import pymysql
    
    # Module-level MySQL connection shared by the whole script; crawler() writes
    # through it. DictCursor makes fetched rows come back as dicts.
    db = pymysql.connect(host='localhost',
                         user='root',
                         password='123456',
                         db='douban',
                         charset='utf8mb4',
                         cursorclass=pymysql.cursors.DictCursor
                         )

    # The adjacent string literals are wrapped in parentheses so they concatenate
    # into one statement; without an enclosing bracket (or backslashes) the
    # continuation lines are a syntax error.
    # The hyphenated column names (`page-code`, `origin-title`) are only valid
    # when quoted with backticks, here and in every later query.
    # NOTE(review): the table is created with CHARSET=utf8 while the connection
    # uses utf8mb4 -- confirm whether the table should be utf8mb4 as well.
    sql = (
        "CREATE TABLE IF NOT EXISTS `top250` ("
        "`id` int(6) NOT NULL AUTO_INCREMENT,"
        "`top` int(6) NOT NULL,"
        "`page-code` int(6) NOT NULL,"
        "`title` varchar(255) NOT NULL,"
        "`origin-title` varchar(255),"
        "`score` float NOT NULL,"
        "`theme` varchar(255) NOT NULL,"
        "PRIMARY KEY(`id`)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;"
    )
    try:
        with db.cursor() as cursor:
            cursor.execute(sql)
        # Commit only after the statement succeeded.
        db.commit()
    except Exception:
        # Setup failed: release the connection before propagating the error.
        db.close()
        raise
    
    # First page of the listing; the crawl starts from this URL.
    base_url = "https://movie.douban.com/top250"

    # Browser-like headers sent with every request.
    # "Cookie" and "User-Agent" carry "xxx" placeholders in this listing.
    header = {
        # Content negotiation
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/webp,image/apng,*/*;q=0.8"
        ),
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # Connection handling
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        # Session / origin information
        "Cookie": "xxx",
        "Host": "movie.douban.com",
        "Referer": "https://movie.douban.com/chart",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "xxx",
    }
    
    
    def _parse_movie_rows(soup, page_code):
        """Build one INSERT parameter tuple per movie found on a listing page.

        Each movie's fields are looked up inside that movie's own container, so
        a movie without a one-line quote cannot shift the data of the movies
        after it or shorten the page (which is what zipping four page-wide tag
        lists did).

        :param soup: parsed listing page (BeautifulSoup object).
        :param page_code: page number text shown in the paginator.
        :return: list of ``(top, page-code, title, origin-title, score, theme)``
            tuples; ``origin-title`` is ``None`` when the movie has no second
            title.
        """
        rows = []
        # NOTE(review): assumes every movie sits in its own <div class="item">
        # -- confirm against the live page markup.
        for item in soup.find_all('div', attrs={'class': 'item'}):
            # The first <em> of the entry holds the rank.
            rank = item.find('em').get_text(strip=True)

            title_tags = item.find_all('span', attrs={'class': 'title'})
            title = title_tags[0].get_text(strip=True)

            # The optional second title span has a leading " / " separator;
            # keep only what follows it.
            origin_title = None
            if len(title_tags) > 1:
                match = re.search(r'/\s*(.+?)\s*$', title_tags[1].get_text())
                if match:
                    origin_title = match.group(1)

            score = item.find('span', attrs={'class': 'rating_num'}).get_text(strip=True)

            # The quote is optional; `theme` is NOT NULL, so store '' if absent.
            theme_tag = item.find('span', attrs={'class': 'inq'})
            theme = theme_tag.get_text(strip=True) if theme_tag is not None else ''

            rows.append((rank, page_code, title, origin_title, score, theme))
        return rows


    def crawler(url=None, headers=None, delay=1):
        """Crawl the listing from ``url``, storing every movie in `top250`.

        Each page is fetched and parsed, and its movies are inserted through
        the module-level ``db`` connection with one commit per page. The
        ``<link rel="next">`` element is then followed until a page has none.

        :param url: URL of the first listing page to fetch.
        :param headers: request headers; copied, so the caller's mapping is not
            modified when ``Referer`` is updated between pages.
        :param delay: seconds to sleep before requesting the second page. Every
            later page is requested after a fixed 3-second pause, as before.
        :raises ValueError: if ``url`` is empty.
        :raises RuntimeError: if the page has no current-page marker (for
            example when the site returned something other than the listing).
        """
        from urllib.parse import urljoin

        if not url:
            raise ValueError('crawler() needs the URL of the first listing page')

        headers = dict(headers) if headers else {}
        # Parenthesised so the two literals concatenate into one statement.
        # A missing origin title is passed as None and stored as NULL.
        insert_sql = (
            "INSERT INTO `top250` (`top`, `page-code`, `title`, `origin-title`, `score`, `theme`)"
            " VALUES (%s, %s, %s, %s, %s, %s)"
        )

        # Iterate instead of recursing once per page.
        while True:
            r = get(url=url, headers=headers, timeout=3)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')

            page_tag = soup.find('span', attrs={'class': 'thispage'})
            if page_tag is None:
                raise RuntimeError('no current-page marker found at %s' % url)
            page_code = page_tag.get_text(strip=True)

            rows = _parse_movie_rows(soup, page_code)
            with db.cursor() as cursor:
                cursor.executemany(insert_sql, rows)
            db.commit()

            # Read the href attribute directly: a regex over the serialised tag
            # depends on attribute order and leaves '&' escaped as '&amp;'.
            next_page = soup.find('link', attrs={'rel': 'next'})
            href = next_page.get('href') if next_page is not None else None
            if not href:
                break

            headers['Referer'] = url
            url = urljoin(url, href)
            sleep(delay)
            delay = 3
    
    
    # Start crawling at the first listing page. The third argument is the delay
    # used before the second request only; it is 0 here.
    try:
        crawler(base_url, header, 0)
    finally:
        # Close the connection even when the crawl aborts with an exception.
        db.close()
    

      结果:

    mysql> select top,title,score from top250 where id = 175;
    +-----+--------+-------+
    | top | title  | score |
    +-----+--------+-------+
    | 176 | 罗生门 |   8.7 |
    +-----+--------+-------+
    1 row in set (0.00 sec)
    
    mysql> select top,title,page-code,score from top250 where id = 175;
    ERROR 1054 (42S22): Unknown column 'page' in 'field list'
    mysql> select top,page-code,title,score from top250 where id = 175;
    ERROR 1054 (42S22): Unknown column 'page' in 'field list'
    mysql> select page-code from top250 where id = 175;
    ERROR 1054 (42S22): Unknown column 'page' in 'field list'
    mysql> describe top250
        -> ;
    +--------------+--------------+------+-----+---------+----------------+
    | Field        | Type         | Null | Key | Default | Extra          |
    +--------------+--------------+------+-----+---------+----------------+
    | id           | int(6)       | NO   | PRI | NULL    | auto_increment |
    | top          | int(6)       | NO   |     | NULL    |                |
    | page-code    | int(6)       | NO   |     | NULL    |                |
    | title        | varchar(255) | NO   |     | NULL    |                |
    | origin-title | varchar(255) | YES  |     | NULL    |                |
    | score        | float        | NO   |     | NULL    |                |
    | theme        | varchar(255) | NO   |     | NULL    |                |
    +--------------+--------------+------+-----+---------+----------------+
    7 rows in set (0.32 sec)
    
    mysql> select page-code from top250 where id = 175;
    ERROR 1054 (42S22): Unknown column 'page' in 'field list'
    mysql> select origin-title from top250 where id = 175;
    ERROR 1054 (42S22): Unknown column 'origin' in 'field list'
    mysql> select origin_title from top250 where id = 175;
    ERROR 1054 (42S22): Unknown column 'origin_title' in 'field list'
    mysql> select * from top250 where id = 175;
    +-----+-----+-----------+--------+--------------+-------+-------------------+
    | id  | top | page-code | title  | origin-title | score | theme             |
    +-----+-----+-----------+--------+--------------+-------+-------------------+
    | 175 | 176 |         8 | 罗生门 | 羅生門       |   8.7 | 人生的N种可能性。 |
    +-----+-----+-----------+--------+--------------+-------+-------------------+
    1 row in set (0.00 sec)
    
    mysql> select * from top250 where title = 未麻的部屋;
    ERROR 1054 (42S22): Unknown column '未麻的部屋' in 'where clause'
    mysql> select * from top250 where top=175;
    Empty set (0.00 sec)
    
    mysql>

      两个小问题:

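      1.没想到字段名里带'-'时不能直接写进查询语句(page-code 会被解析成 page 减 code),于是 page-code 字段与 origin-title 字段没法像其他字段那样直接单独查询;需要用反引号把字段名括起来,例如 select `page-code` from top250 where id = 175;,或者建表时改用下划线命名(page_code、origin_title)。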

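      2.不知道为啥top175的电影《未麻的部屋》没爬到。。。原因很可能是:代码用 zip 把排名、标题、评分、短评四个列表按位置对齐,而有的电影没有一句话短评(class 为 inq 的 span);只要某一页缺一条短评,zip 就会按最短的列表截断,该页最后一部电影(第7页的最后一部正好是第175名)被丢掉,缺短评的那部及其后面电影的 theme 也会错位。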

      建议使用scrapy。

      用scrapy的一些好处是配置爬虫很方便,还有其内部自带的html解析器、对不完整的url的组建等十分便利。

      最后,吐槽一下,之前的电脑配置太差,跑深度学习程序的过程耗尽内存,出现莫名的bug后,蓝屏死机就再也没法启动了。。。所以,暂时不能更新博客了。。。

  • 相关阅读:
    博弈论(SG函数):HNOI 2007 分裂游戏
    博弈论(二分图匹配):NOI 2011 兔兔与蛋蛋游戏
    博弈论(男人八题):POJ 1740 A New Stone Game
    动态规划(树形DP):HDU 5834 Magic boy Bi Luo with his excited tree
    杂项(最小表示法):HZOI 2015 Glass Beads
    如何避免死锁
    死锁的四个必要条件
    线程安全和可重入函数之间的区别和联系
    信号量 sem_undo设置
    linux管道的容量和内部组织方式
  • 原文地址:https://www.cnblogs.com/darkchii/p/10003876.html
Copyright © 2011-2022 走看看