zoukankan      html  css  js  c++  java
  • 爬虫实例

    1. 爬校花网图片:

    这是一个很基础的爬虫小例子,爬取校花网的图片。其中用了requests-html库:

    先获取每页的url,再爬取每页中的图片进行解析处理,最后存入到文件中

    from requests_html import HTMLSession
    import os
    
    class Spider():
        """Scrape thumbnail images from the xiaohuar.com listing pages and
        save them into a local ``pictures`` directory."""

        def __init__(self):
            # One shared session so TCP connections are reused across requests.
            self.session = HTMLSession()

        def get_index_page(self):
            """Yield the URLs of listing pages 1-3 (page 1 has a special URL)."""
            for i in range(1, 4):
                if i == 1:
                    page_url = "http://www.xiaohuar.com/meinv/index.html"
                else:
                    page_url = "http://www.xiaohuar.com/meinv/index_%s.html" % i
                yield page_url

        def get_image_name(self, page_url):
            """Yield ``(image_url, image_name)`` pairs parsed from one listing page.

            The title text is stripped of characters that are unsafe in
            filenames before ``.jpg`` is appended.
            """
            r = self.session.get(url=page_url)
            elements_list = r.html.find('#images .items')
            for element in elements_list:
                image_url = element.find('img', first=True).attrs.get('src')
                image_name = element.find('.p_title a', first=True).text
                # BUG FIX: the original source wrote '\', which escapes the
                # closing quote and is a SyntaxError; '\\' correctly removes
                # a literal backslash from the title.
                image_name = image_name.replace('【', '').replace('】', '').replace(
                    '|', '').replace('\\', '').replace('/', '') + '.jpg'
                yield image_url, image_name

        def save(self, image_url, image_name):
            """Download one image and write it under the ``pictures`` directory."""
            # Robustness: create the target directory on first use instead of
            # crashing with FileNotFoundError when it is absent.
            os.makedirs('pictures', exist_ok=True)
            image_name = os.path.join('pictures', image_name)
            if not image_url.startswith('http'):
                # Listing pages sometimes carry site-relative src attributes.
                image_url = 'http://www.xiaohuar.com' + image_url
            r = self.session.get(url=image_url)
            with open(image_name, 'wb') as f:
                f.write(r.content)
                print('%s下载完成' % image_name)

        def run(self):
            """Crawl every listing page and save every image found on it."""
            for page_url in self.get_index_page():
                for image_url, image_name in self.get_image_name(page_url):
                    self.save(image_url, image_name)
    
    
    if __name__ == '__main__':
        # Script entry point: build the crawler and start it immediately.
        Spider().run()
    

    2. 豆瓣电影排行信息

    爬虫获取豆瓣电影信息:在这里对电影进行筛选是通过url携带参数。所以先获取参数信息。

    from requests_html import HTMLSession
    
    class Spider:
        """Fetch ranked movie listings from the Douban search JSON endpoint,
        filtered by user-supplied query parameters."""

        def __init__(self):
            self.api = "https://movie.douban.com/j/new_search_subjects?"
            self.session = HTMLSession()

        def get_params(self):
            """Prompt the user for sort order and year range, then store them
            as the query-parameter dict used by later requests."""
            chosen_sort = input('请输入按什么排序(S评分)')
            chosen_years = input('请输入年份:')
            self.params = dict(sort=chosen_sort, year_range=chosen_years, start=0)

        def get_message(self):
            """Request the first 10 result pages (20 movies per page) and
            print each page's JSON payload."""
            for page in range(10):
                self.params['start'] = page * 20
                response = self.session.get(url=self.api, params=self.params)
                print(response.json())

        def run(self):
            """Collect the filters from the user, then fetch and print results."""
            self.get_params()
            self.get_message()
    if __name__ == '__main__':
        # Script entry point: create the crawler and run it.
        Spider().run()
    

    3. 爬取校花视频

    校花视频是通过m3u8格式。有的视频会员有反爬机制,查看元素的播放链接是unknown,所以获取不到资源。

    先获取到m3u8格式的播放列表链接,然后发送请求,获取到的文件内容是一行行的.ts条目(.ts也是一种文件格式)。对m3u8文件内容进行处理,逐条发送请求下载.ts文件并保存

    from requests_html import HTMLSession
    import os
    
    class spider():
        """Download m3u8 playlists and their .ts segments from the
        xiaohuar.com video listing pages."""

        def __init__(self):
            # Shared session reuses connections across the many small requests.
            self.session = HTMLSession()

        def get_index_page(self):
            """Yield the URLs of the video listing pages (list-3-0 .. list-3-6)."""
            for i in range(7):
                url = 'http://www.xiaohuar.com/list-3-%s.html' % i
                yield url

        def parse_index_page(self, index_page):
            """Yield the detail-page href of every video linked on one listing page."""
            r = self.session.get(url=index_page)
            elements_list = r.html.find('#images .items a[class="imglink"]')
            for element in elements_list:
                yield element.attrs.get('href')

        def parse_detail_page(self, detail_page):
            """Yield ``(m3u8_url, title)`` scraped from a detail page.

            Member-only videos expose no playable source; then nothing is
            yielded and a notice is printed instead.
            """
            r = self.session.get(url=detail_page)
            r.html.encoding = 'GBK'  # these pages are GBK-encoded, not UTF-8
            result_obj = r.html.search('var vHLSurl    = "{}";')
            if result_obj:
                m3u8_url = result_obj[0]
                # BUG FIX: the original source wrote '\', which escapes the
                # closing quote and is a SyntaxError; '\\' strips literal
                # backslashes so the title is usable as a directory name.
                m3u8_name = r.html.find('title', first=True).text.replace('\\', '')
                yield m3u8_url, m3u8_name
            else:
                print("匹配失败,无资源")

        def save_m3u8(self, m3u8_url, m3u8_name):
            """Save the playlist to disk, then download every .ts segment it lists."""
            m3u8_dir = m3u8_name
            if not os.path.exists(m3u8_dir):
                os.mkdir(m3u8_dir)
            print(m3u8_url)
            r = self.session.get(url=m3u8_url)
            m3u8_path = os.path.join(m3u8_dir, 'playlist.m3u8')
            # 'wt+' lets us write the playlist and immediately re-read it
            # line by line after seeking back to the start of the file.
            with open(m3u8_path, 'wt+', encoding='utf-8') as f:
                f.write(r.text)
                f.seek(0, 0)
                for line in f:
                    line = line.strip()
                    if line.endswith('.ts'):
                        # Segment names are relative to the playlist's own URL.
                        ts_url = os.path.dirname(m3u8_url) + '/%s' % line
                        r = self.session.get(url=ts_url)
                        ts_path = os.path.join(m3u8_dir, line)
                        with open(ts_path, 'wb') as f1:
                            f1.write(r.content)
                            print('%s下载完毕' % line)

        def run(self):
            """Crawl: listing pages → detail pages → playlists → segments."""
            for url in self.get_index_page():
                for detail_page in self.parse_index_page(url):
                    for m3u8_url, m3u8_name in self.parse_detail_page(detail_page):
                        self.save_m3u8(m3u8_url, m3u8_name)
    
    
    if __name__ == '__main__':
        # Script entry point: create the video crawler and run it.
        spider().run()
    
  • 相关阅读:
    (十三)页面权限控制
    (十二)用户管理模块
    Vue笔记:生命周期和钩子函数
    (十一)第三方图标库
    (十)动态加载菜单
    windows下php配置环境变量
    docker在mac下安装及配置阿里云镜像加速
    pm2-web监控
    PHP判断两个矩形是否相交
    ubuntu下安装ffmpeg扩展
  • 原文地址:https://www.cnblogs.com/863652104kai/p/11704162.html
Copyright © 2011-2022 走看看