zoukankan      html  css  js  c++  java
  • datawhale爬虫task01

    #使用requests、正则表达式,爬取豆瓣电影top250排行榜
    #要求抓取名次、影片名称、年份、导演等字段。
    
    
    import requests
    import re
    import csv
    import time
    class doubanTop250():
        """Scrape the Douban Movie Top 250 chart and store it in a CSV file.

        Each film is represented as a dict with the keys listed in ``FIELDS``.
        Parsed films accumulate in ``self.film_list`` (one list per instance).
        """

        BASE_URL = "https://movie.douban.com/top250?start="
        PAGE_SIZE = 25        # films listed on each chart page
        TOTAL_FILMS = 250     # size of the whole chart
        REQUEST_DELAY = 5     # seconds to wait between two page requests
        TIMEOUT = 10          # seconds before an HTTP request is abandoned
        FIELDS = ('rank', 'name', 'country', 'director', 'score')

        # Patterns are compiled once. All captures are non-greedy so a match
        # never runs past the closing delimiter of its own field.
        RANK_RE = re.compile(r'<em class="">(\d+)</em>')
        NAME_RE = re.compile(r'<img width="100" alt="(.*?)" src=')
        COUNTRY_RE = re.compile(r'&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;')
        # Accept an ASCII or full-width colon and stop before the cast list,
        # the padding entities, the line break tag, or the end of the line.
        DIRECTOR_RE = re.compile(r'导演[::]\s*(.*?)(?:&nbsp;|主演|<br|$)',
                                 re.MULTILINE)
        SCORE_RE = re.compile(
            r'<span class="rating_num" property="v:average">(.*?)</span>')

        def __init__(self):
            # Per-instance storage; a class-level list would be shared by
            # every scraper object.
            self.film_list = []

        @staticmethod
        def _first(pattern, text):
            """Return the stripped first capture of *pattern* in *text*, or ''."""
            found = pattern.search(text)
            return found.group(1).strip() if found else ''

        def page_urls(self):
            """Return the URLs of all chart pages, in ranking order."""
            return [self.BASE_URL + str(start)
                    for start in range(0, self.TOTAL_FILMS, self.PAGE_SIZE)]

        # 1. Send the request
        def send_request(self, url):
            """Fetch *url* with a browser-like User-Agent and return the response.

            The HTTP status code is printed so a blocked request is visible.
            """
            headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
            # A timeout keeps a stalled connection from hanging the scraper.
            response = requests.get(url=url, headers=headers,
                                    timeout=self.TIMEOUT)
            print(response.status_code)
            return response

        # 2. Parse the data
        def parse(self, response):
            """Extract the films of one chart page and append them to film_list.

            The page is cut into one slice per film, delimited by the rank
            markers, and every field is looked up inside its own slice. A field
            that is absent becomes an empty string instead of shifting the
            values of the following films.

            Args:
                response: object whose ``content`` attribute holds the page
                    as UTF-8 encoded bytes.

            Returns:
                The list of film dicts parsed from this page.
            """
            data = response.content.decode('utf-8')
            rank_matches = list(self.RANK_RE.finditer(data))
            # Each slice ends where the next rank marker starts (or at EOF).
            slice_ends = [m.start() for m in rank_matches[1:]] + [len(data)]

            films = []
            for match, stop in zip(rank_matches, slice_ends):
                item_html = data[match.end():stop]
                films.append({
                    'rank': match.group(1),
                    'name': self._first(self.NAME_RE, item_html),
                    'country': self._first(self.COUNTRY_RE, item_html),
                    'director': self._first(self.DIRECTOR_RE, item_html),
                    'score': self._first(self.SCORE_RE, item_html),
                })
            self.film_list.extend(films)
            return films

        # 3. Store the data
        def save_data(self, path='top250.csv'):
            """Write film_list to *path* as UTF-8 CSV with a header row.

            An empty film_list produces a file that contains only the header.
            """
            if self.film_list:
                fieldnames = list(self.film_list[0])
            else:
                fieldnames = list(self.FIELDS)
            # newline='' lets the csv module control line endings; without it
            # every row is followed by a blank line on Windows.
            with open(path, 'w', encoding='utf-8', newline='') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames,
                                        restval='', extrasaction='ignore')
                writer.writeheader()
                writer.writerows(self.film_list)

        # 4. Run
        def run(self):
            """Download every chart page, parse it, then save the result."""
            for index, url in enumerate(self.page_urls()):
                if index:
                    # Pause between requests to avoid hammering the site.
                    time.sleep(self.REQUEST_DELAY)
                response = self.send_request(url)
                self.parse(response)
            self.save_data()
    
    
    
    # Start the scraper only when the file is executed as a script, so that
    # importing this module does not trigger network requests.
    if __name__ == "__main__":
        doubanTop250().run()
  • 相关阅读:
    vue--vue-resource实现 get, post, jsonp请求
    vue--生命周期演示
    vue--自定义指令
    vue--键盘修饰符以及自定义键盘修饰符
    vue--过滤器(私有,全局)
    mysql5.7二进制包安装方式
    搭建GIT服务器
    服务器集群,及服务器高并发调优备忘
    iptables 配置问题,以及centos firewall 配置
    nginx 编译安装
  • 原文地址:https://www.cnblogs.com/tommyngx/p/11312172.html
Copyright © 2011-2022 走看看