zoukankan      html  css  js  c++  java
  • Spider -- MySQL数据库 之 增量爬取

    把电影天堂数据存入MySQL数据库 - 增量爬取

    # 思路
    # 1、MySQL中新建表 request_finger,存储所有爬取过的链接的指纹
    # 2、在爬取之前,先判断该指纹是否爬取过,如果爬取过,则不再继续爬取

    1、建库建表

    # 建库建表
    create database if not exists filmskydb charset utf8;
    use filmskydb;

    -- One row per detail-page URL that has already been crawled.
    -- finger holds the 32-character hex MD5 digest of the URL; making it the
    -- primary key rejects duplicates and indexes the "already crawled?" lookup.
    create table if not exists request_finger (
        finger char(32) not null,
        primary key (finger)
    ) charset=utf8;

    -- Scraped movie records: title and download link.
    create table if not exists filmtab (
        name varchar(200) not null,
        download varchar(500) not null
    ) charset=utf8;

     

    2、完整代码

    from urllib import request
    import re
    from useragents import ua_list  # 自己个人写的模块,提供随机User-Agent
    import time
    import random
    import pymysql
    from hashlib import md5
    ​
    ​
    class FilmSkySpider(object):
        """Incremental crawler that stores dytt8.net movie data in MySQL.

        Every detail-page URL is reduced to an MD5 fingerprint. Fingerprints of
        pages that were scraped successfully are kept in the ``request_finger``
        table, so a later run skips pages that are already stored.
        """

        def __init__(self):
            # URL template of the first-level (listing) pages; {} is the page number.
            self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
            # Keyword arguments are required: PyMySQL 1.0 dropped support for
            # passing the connection parameters positionally.
            self.db = pymysql.connect(
                host='localhost',
                user='root',
                password='123456',
                database='filmskydb',
                charset='utf8',
            )
            self.cursor = self.db.cursor()

        def get_html(self, url):
            """Download ``url`` with a random User-Agent and return the page text.

            :param url: absolute URL to request.
            :return: response body decoded as gb2312.
            """
            headers = {'User-Agent': random.choice(ua_list)}
            req = request.Request(url=url, headers=headers)
            # The context manager closes the HTTP response even if read() raises.
            with request.urlopen(req) as res:
                # The site declares charset='gb2312'; bytes that cannot be
                # decoded are dropped instead of raising UnicodeDecodeError.
                return res.read().decode('gb2312', 'ignore')

        def re_func(self, re_bds, html):
            """Return every match of the regex ``re_bds`` in ``html``.

            The pattern is compiled with ``re.S`` so ``.`` also matches newlines.
            """
            return re.compile(re_bds, re.S).findall(html)

        def parse_page(self, one_url):
            """Crawl one listing page and scrape each detail page not seen before.

            :param one_url: URL of a first-level (listing) page.
            """
            html = self.get_html(one_url)
            re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">.*?</table>'
            # Each match is a site-relative link such as '/html/xxx'.
            for href in self.re_func(re_bds, html):
                two_url = 'https://www.dytt8.net' + href
                # Fingerprint of the detail-page URL (hex MD5 digest).
                two_url_md5 = md5(two_url.encode()).hexdigest()
                if not self.is_go_on(two_url_md5):
                    continue
                # Record the fingerprint only when the page was really stored,
                # so a page that failed to parse is retried on the next run.
                if self.parse_two_page(two_url):
                    ins = 'insert into request_finger (finger) values (%s)'
                    self.cursor.execute(ins, [two_url_md5])
                    self.db.commit()
                # Random pause (float seconds) after each detail-page request.
                time.sleep(random.uniform(1, 3))

        def is_go_on(self, two_url_md5):
            """Tell whether the URL with this fingerprint still has to be crawled.

            :param two_url_md5: hex MD5 fingerprint of a detail-page URL.
            :return: True when the fingerprint is not in ``request_finger`` yet,
                False otherwise.
            """
            sel = 'select finger from request_finger where finger=%s'
            # execute() reports the number of matching rows; zero means "new".
            return not self.cursor.execute(sel, [two_url_md5])

        def parse_two_page(self, two_url):
            """Scrape one detail page and store its name and download link.

            :param two_url: absolute URL of a second-level (detail) page.
            :return: True when a record was inserted, False when the page did
                not contain the expected markup and was skipped.
            """
            html = self.get_html(two_url)
            re_bds = r'<div class="title_all"><h1><font color=#07519a>(.*?)</font></h1></div>.*?<td style="WORD-WRAP.*?>.*?>(.*?)</a>'
            # Expected shape: [('name', 'ftp://xxxx.mkv')]
            two_page_list = self.re_func(re_bds, html)
            if not two_page_list:
                # Unexpected page layout: skip it instead of raising IndexError
                # and aborting the whole crawl.
                print('skip (no match): {}'.format(two_url))
                return False

            name, download = two_page_list[0]
            film_list = [name.strip(), download.strip()]

            ins = 'insert into filmtab (name, download) values (%s, %s)'
            self.cursor.execute(ins, film_list)
            self.db.commit()
            print(film_list)
            return True

        def close(self):
            """Release the database cursor and connection."""
            self.cursor.close()
            self.db.close()

        def main(self, first_page=1, last_page=200):
            """Crawl listing pages ``first_page`` to ``last_page`` inclusive.

            The database connection is closed when the crawl ends, whether it
            finished normally or was interrupted by an exception.
            """
            try:
                for page in range(first_page, last_page + 1):
                    self.parse_page(self.url.format(page))
                    # Random pause (float seconds) between listing pages.
                    time.sleep(random.uniform(1, 3))
            finally:
                self.close()
    ​
    ​
    # Script entry point: build the spider and start the full crawl.
    if __name__ == '__main__':
        FilmSkySpider().main()

     

  • 相关阅读:
    背水一战 Windows 10 (26)
    背水一战 Windows 10 (25)
    背水一战 Windows 10 (24)
    背水一战 Windows 10 (23)
    背水一战 Windows 10 (22)
    背水一战 Windows 10 (21)
    背水一战 Windows 10 (20)
    背水一战 Windows 10 (19)
    背水一战 Windows 10 (18)
    背水一战 Windows 10 (17)
  • 原文地址:https://www.cnblogs.com/gengyufei/p/12643368.html
Copyright © 2011-2022 走看看