  • Python Spider: Scraping the Maoyan Top 100 Movies

    """
    抓取猫眼电影TOP100
    """
    
    import re
    import time
    import requests
    
    from bs4 import BeautifulSoup
    
    
    class SpiderMaoyan(object):
    
        def __init__(self):
            # Analyzing the URL shows that the Maoyan TOP100 pages are paginated via offset in steps of 10
            self.url = "http://maoyan.com/board/4?offset={0}"
            # Set a User-Agent, otherwise access may be denied
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/66.0.3359.139 Safari/537.36"
            }
            # Fields to extract
            self.fields = ("id", "name", "movieUrl", "imgUrl", "star", "releaseTime", "score")
    
        def handler(self, offset=0):
    
            while offset < 100:
                response = requests.get(self.url.format(offset), headers=self.headers)
                if response.status_code == 200:
                    print("INFO -> Current URL: <%s>" % response.url)
                    # Re-encode the response body, otherwise Chinese text may come out garbled
                    r_html = response.text.encode(response.encoding).decode("utf-8")
                    # Build a BeautifulSoup object for the tag/content extraction below
                    soup = BeautifulSoup(r_html, "html5lib")
                    # Inspecting the page source shows each movie sits inside a <dd></dd> tag
                    tags = soup.select("dd")
                    # Extract the fields
                    for tag in tags:
                        # id、name、movieUrl
                        obj = tag.find("p", class_="name").select_one("a")
                        _id = re.search(r"movieId:(\d+)", obj.get("data-val")).group(1)
                        _name = obj.string
                        _movieUrl = "http://maoyan.com" + obj.get("href")
                        # img
                        # Tips: the resolution suffix can be stripped with .split("@")[0] to keep the full-size image URL
                        _imgUrl = tag.find("img", class_="board-img").get("data-src")
                        # star
                        # Tips: the delimiter is the full-width Chinese colon "："
                        _star = tag.find("p", class_="star").string.strip().split("：")[-1]
                        # releaseTime
                        # Tips: the delimiter is the full-width Chinese colon "："
                        _releaseTime = tag.find("p", class_="releasetime").string.split("：")[-1]
                        # score
                        _score = tag.find("p", class_="score").get_text()
    
                        # At this point the data can be written to storage
                        # Tips: with SQL generated this way, it is worth verifying that keys and values pair up
                        print(
                            "INSERT INTO TABLE_NAME (%s) VALUE %s;" % (
                                ", ".join(self.fields), tuple([_id, _name, _movieUrl, _imgUrl, _star, _releaseTime, _score])
                            )
                        )
    
                    # Advance the offset
                    offset += 10
                    # Pause briefly between requests
                    time.sleep(.9)
                else:
                    print(response.reason)
                    exit(999)
    
    
    if __name__ == "__main__":
    
        spider = SpiderMaoyan()
        spider.handler()
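
    The handler above only prints the generated INSERT statements. As a rough follow-up sketch (not part of the original script), the rows could instead be written with parameterized queries, which sidesteps the key/value pairing concern noted in the comment; the sqlite3 database, table, and column names below are illustrative assumptions:

    import sqlite3

    def save_rows(rows, db_path="maoyan.db"):
        """Persist scraped rows; each row is a 7-tuple matching SpiderMaoyan.fields."""
        conn = sqlite3.connect(db_path)
        conn.execute(
            "CREATE TABLE IF NOT EXISTS movies ("
            "id TEXT PRIMARY KEY, name TEXT, movieUrl TEXT, imgUrl TEXT, "
            "star TEXT, releaseTime TEXT, score TEXT)"
        )
        # Placeholders let the driver handle quoting/escaping instead of string formatting
        conn.executemany("INSERT OR REPLACE INTO movies VALUES (?, ?, ?, ?, ?, ?, ?)", rows)
        conn.commit()
        conn.close()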
  • Original article: https://www.cnblogs.com/wangxiaoqiangs/p/8998298.html