zoukankan      html  css  js  c++  java
  • python--爬取豆瓣热门国产电视剧保存为文件

    # -*- coding: utf-8 -*-
    __author__ = 'Frank Li'
    import requests
    import json
    
    class HotSpider(object):
        """Crawl Douban's mobile "hot domestic TV" collection API.

        Each page of results is fetched with a shared ``requests`` session and
        every item is appended to a file as one JSON document per line.
        """

        def __init__(self):
            # ``{}`` is filled with the paging offset (``start``); the page size
            # is fixed by ``count=18`` inside the URL itself.
            self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288"
            self.session = requests.session()
            self.headers = {"Referer": "https://m.douban.com/tv/chinese",
                            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"}

        def parse_2_list_from_str(self,url):
            """Fetch *url* and return the value stored under 'subject_collection_items'.

            Raises:
                KeyError: if the decoded JSON has no 'subject_collection_items' key.
            """
            # A timeout keeps the crawler from hanging forever on a stalled connection.
            response = self.session.get(url,headers=self.headers,timeout=10)
            payload = json.loads(response.content.decode())
            return payload['subject_collection_items']

        def save_as_file(self,content_list,file):
            """Append every element of *content_list* to *file*, one JSON document per line.

            Args:
                content_list: iterable of JSON-serialisable objects.
                file: path of the output file; opened in append mode as UTF-8.
            """
            with open(file,'a',encoding='utf-8') as f:
                for content in content_list:
                    # ensure_ascii=False keeps non-ASCII text readable in the file.
                    f.write(json.dumps(content,ensure_ascii=False))
                    f.write('\n')

        def run(self):
            """Walk the collection page by page and append the items to 'hot.json'."""
            page_size = 18  # must match ``count=18`` in self.url
            total = 500
            # Same offsets as the original loop: 0, 18, ... while offset < total + 18.
            for num in range(0, total + page_size, page_size):
                url = self.url.format(num)
                print(url)
                items = self.parse_2_list_from_str(url)
                if not items:
                    # An empty page means there is nothing further to fetch.
                    break
                self.save_as_file(items,'hot.json')
    
    if __name__ == '__main__':
        # Script entry point: build the spider and start crawling right away.
        HotSpider().run()
    
    
    

    使用 xpath 爬取正在热映的电影并保存为 json 文件

    # -*- coding: utf-8 -*-
    __author__ = 'Frank Li'
    import requests
    from lxml import etree
    import json
    
    # Fetch the "now playing" page and pull poster, title, link and rating via XPath.
    url = "https://movie.douban.com/cinema/nowplaying/changsha/"
    headers = {"Referer":"https://movie.douban.com/cinema/nowplaying/changsha/",
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
    sess = requests.session()
    response = sess.get(url,headers=headers)
    html_str = response.content.decode()
    element = etree.HTML(html_str)
    movie_img_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='poster']//img/@src")
    movie_name_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='stitle']/a/@title")
    movie_addr_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='stitle']/a/@href")
    movie_score_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='srating']/span[@class='subject-rate']/text()")

    # NOTE(review): the four lists are paired purely by position and zip() stops at
    # the shortest one; if some entry lacks a 'subject-rate' span the fields would
    # shift against each other - confirm against the live page.
    # The output file is opened once for the whole run instead of once per movie.
    with open('movie.json','a',encoding='utf-8') as f:
        for name,img,addr,score in zip(movie_name_list,movie_img_list,movie_addr_list,movie_score_list):
            item = {'name': name, 'img': img, 'addr': addr, 'score': score}
            item_json = json.dumps(item, ensure_ascii=False, indent=2)
            print(item_json)
            f.write(item_json)
            f.write('\n')
    
    

    保存下来的 movie.json 文件

    {
      "name": "碟中谍6:全面瓦解",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529365085.jpg",
      "addr": "https://movie.douban.com/subject/26336252/?from=playing_poster",
      "score": "8.3"
    }
    {
      "name": "阿尔法:狼伴归途",
      "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530871439.jpg",
      "addr": "https://movie.douban.com/subject/26810318/?from=playing_poster",
      "score": "6.5"
    }
    {
      "name": "蚁人2:黄蜂女现身",
      "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529389608.jpg",
      "addr": "https://movie.douban.com/subject/26636712/?from=playing_poster",
      "score": "7.5"
    }
    {
      "name": "传奇的诞生",
      "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531286907.jpg",
      "addr": "https://movie.douban.com/subject/3073268/?from=playing_poster",
      "score": "7.6"
    }
    {
      "name": "快把我哥带走",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531080870.jpg",
      "addr": "https://movie.douban.com/subject/30122633/?from=playing_poster",
      "score": "7.0"
    }
    {
      "name": "道高一丈",
      "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530863118.jpg",
      "addr": "https://movie.douban.com/subject/26954268/?from=playing_poster",
      "score": "5.7"
    }
    {
      "name": "李宗伟:败者为王",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530870325.jpg",
      "addr": "https://movie.douban.com/subject/27195119/?from=playing_poster",
      "score": "7.1"
    }
    {
      "name": "西虹市首富",
      "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529206747.jpg",
      "addr": "https://movie.douban.com/subject/27605698/?from=playing_poster",
      "score": "6.7"
    }
    {
      "name": "一出好戏",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529571873.jpg",
      "addr": "https://movie.douban.com/subject/26985127/?from=playing_poster",
      "score": "7.3"
    }
    {
      "name": "精灵旅社3:疯狂假期",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530591543.jpg",
      "addr": "https://movie.douban.com/subject/26630714/?from=playing_poster",
      "score": "6.9"
    }
    {
      "name": "苏丹",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529570494.jpg",
      "addr": "https://movie.douban.com/subject/26728641/?from=playing_poster",
      "score": "7.0"
    }
    {
      "name": "巨齿鲨",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530572643.jpg",
      "addr": "https://movie.douban.com/subject/26426194/?from=playing_poster",
      "score": "6.0"
    }
    {
      "name": "藏北秘岭-重返无人区",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532522676.jpg",
      "addr": "https://movie.douban.com/subject/30208007/?from=playing_poster",
      "score": "6.2"
    }
    {
      "name": "那些女人",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530146643.jpg",
      "addr": "https://movie.douban.com/subject/26574965/?from=playing_poster",
      "score": "5.3"
    }
    {
      "name": "草戒指",
      "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531782507.jpg",
      "addr": "https://movie.douban.com/subject/27204180/?from=playing_poster",
      "score": "5.6"
    }
    {
      "name": "吻隐者",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531980221.jpg",
      "addr": "https://movie.douban.com/subject/26928809/?from=playing_poster",
      "score": "7.6"
    }
    {
      "name": "禹神传之寻找神力",
      "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532781444.jpg",
      "addr": "https://movie.douban.com/subject/30227727/?from=playing_poster",
      "score": "6.6"
    }
    {
      "name": "大师兄",
      "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2528842218.jpg",
      "addr": "https://movie.douban.com/subject/27201353/?from=playing_poster",
      "score": "6.2"
    }
    
    

    简单多线程 图片下载

    import requests
    from bs4 import BeautifulSoup
    import os
    import threading
    
    def download_img(src,target=None):
        """Download the image at *src* into the ``./img`` directory.

        Args:
            src: URL of the image.
            target: optional file name to save under ``./img``; when omitted the
                last ``/``-separated segment of *src* is used.

        Raises:
            requests.HTTPError: if the server answers with an error status, so
                that an error page is never saved as an image file.
        """
        parent_dir = './img'
        os.makedirs(parent_dir,exist_ok=True)
        if target is None:
            target = src.split('/')[-1]
        target = os.path.join(parent_dir,target)
        print(threading.current_thread(),' start to download img: ',target)
        # The context manager releases the streamed connection even on failure.
        with requests.get(src,stream=True) as r:
            r.raise_for_status()
            with open(target,'wb') as tar_file:
                # Stream the body in moderately sized chunks rather than 128-byte pieces.
                for chunk in r.iter_content(chunk_size=8192):
                    tar_file.write(chunk)
        print('saved {}'.format(target))
            
    
            
    if __name__ == '__main__':
        # Fetch the thread page and collect the URL of every post image.
        URL = 'https://tieba.baidu.com/p/6034793219'
        soup = BeautifulSoup(requests.get(URL).text,'lxml')
        imgs = [tag['src'] for tag in soup.find_all('img',{'class':'BDE_Image'})]

        # Start one named worker thread per image.
        threads = []
        for idx,img_url in enumerate(imgs):
            worker = threading.Thread(target=download_img,args=(img_url,),name='Thread-{}'.format(idx))
            worker.start()
            threads.append(worker)

        # Block until every download has finished.
        for worker in threads:
            worker.join()
        
    
    如果有来生,一个人去远行,看不同的风景,感受生命的活力。。。
  • 相关阅读:
    成为Emacs高手01-学习自带教程
    成为Emacs高手03-学习基础Elisp
    Google Drive For Linux
    Yet Another Scheme Tutorial 02
    1、Maven 基本配置
    eclipse添加easyExport插件,打开本地文件
    原创一看便知、Maven创建web项目
    1、启动oracle的步骤
    java正则表达式【大全】
    servlet上传下载(任何格式的都可以)
  • 原文地址:https://www.cnblogs.com/Frank99/p/9610069.html
Copyright © 2011-2022 走看看