zoukankan      html  css  js  c++  java
  • 二级静态页面的爬取-----电影天堂

    '''二级静态页面的爬取'''
    from urllib import request
    import re
    import time
    import random
    import pymysql
    
    
    class DianyingtiantangSpider:
        """Two-level scraper for dytt8.net (Dianying Tiantang / "Movie Heaven").

        Crawls the paginated movie-list pages, follows each movie's detail page
        to extract its download link, and stores (name, download_link) rows in
        the local MySQL table ``film``.
        """

        def __init__(self):
            # List-page URL template; {} is the 1-based page number.
            self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
            # One User-Agent is picked per spider instance to vary the fingerprint.
            self.headers = {'User-Agent': random.choice([
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
            ])}
            # NOTE(review): credentials are hard-coded; move to config/env in real use.
            self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='dianyingdb',
                                      charset='utf8')
            self.cursor = self.db.cursor()

        def get_page(self, url):
            """Fetch *url* and return its HTML as text.

            Used for both list and detail pages. The site serves gb2312;
            'ignore' skips undecodable bytes instead of raising.
            """
            req = request.Request(url=url, headers=self.headers)
            res = request.urlopen(req)
            return res.read().decode('gb2312', 'ignore')

        def parse_page(self, html):
            """Parse one list page and persist its movies.

            Extracts (detail-link, name) pairs, resolves each detail page's
            download link via :meth:`parse_two_page`, then saves all rows.
            """
            pattern = re.compile('<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>',
                                 re.S)
            # film_list: [('detail href', 'movie name'), ...]
            film_list = pattern.findall(html)
            result_list = []
            for href, name in film_list:
                film_name = name.strip()
                film_link = 'https://www.dytt8.net{}'.format(href.strip())
                download_link = self.parse_two_page(film_link)
                result_list.append([film_name, download_link])
            self.save_page(result_list)

        def parse_two_page(self, film_link):
            """Fetch a movie's detail page and return its first download link.

            Returns an empty string when the page contains no matching link,
            instead of raising IndexError on ``[0]``.
            """
            two_html = self.get_page(film_link)
            pattern = re.compile('<td style="WORD-WRAP.*?>.*?>(.*?)</a>', re.S)
            download_link = pattern.findall(two_html)
            if not download_link:
                return ''
            return download_link[0].strip()

        def save_page(self, result_list):
            """Bulk-insert (name, download_link) rows into the ``film`` table."""
            ins = 'insert into film values(%s,%s)'
            self.cursor.executemany(ins, result_list)
            self.db.commit()

        def main(self):
            """Clear the ``film`` table, crawl list pages 1-4, then clean up.

            Cursor and connection are closed in a ``finally`` block so a
            failed request mid-crawl no longer leaks them.
            """
            ins = 'delete from film'
            self.cursor.execute(ins)
            self.db.commit()
            try:
                for i in range(1, 5):
                    url = self.url.format(i)
                    html = self.get_page(url)
                    self.parse_page(html)
                    print('第{}页爬取成功'.format(i))
                    # Random pause to avoid hammering the site.
                    time.sleep(random.randint(1, 3))
            finally:
                self.cursor.close()
                self.db.close()
    
    
    if __name__ == '__main__':
        # Run the crawl and report wall-clock duration.
        t_begin = time.time()
        DianyingtiantangSpider().main()
        t_finish = time.time()
        elapsed = t_finish - t_begin
        print('程序执行时间为:%.2f' % elapsed)
  • 相关阅读:
    汪博士解读PMP考试
    ASP.NET编程实战宝典(光盘内容另行下载,地址见书封底)
    [模板]tarjan算法求SCC
    [POJ 3071]Football[概率DP]
    [数学]根式有理化[高中数学技巧]
    [平面几何]角格点问题
    [数学]对数均值不等式
    [模板][快速排序&归并排序]
    [POJ]P3126 Prime Path[BFS]
    每日一题_191219
  • 原文地址:https://www.cnblogs.com/yuxiangyang/p/11214375.html
Copyright © 2011-2022 走看看