  • Crawlers 2 (Part 2)

    1. Xiaohuar image scraper

    from requests_html import HTMLSession
    import os
    
    
    class spider():
        def __init__(self):
            self.session = HTMLSession()
            self.headers = {
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
            }
    
        def get_index_url(self):
            for i in range(1,4):
                if i == 1:
                    yield 'http://www.xiaohuar.com/meinv/index.html'
                else:
                    yield 'http://www.xiaohuar.com/meinv/index_%s.html'%i
    
        def get_img_name(self,index_url):
            r = self.session.get(url=index_url,headers=self.headers)
            elements_list = r.html.find('#images .items')
            for element in elements_list:
                img_url:str = element.find('img',first=True).attrs.get('src')
                if not img_url.startswith('http'):
                    img_url = 'http://www.xiaohuar.com' + img_url
            img_name = element.find('.p_title>a',first=True).text.replace('\\','').replace('/','') + '.jpg'   # strip path separators so the title is a valid filename
                yield img_url,img_name
    
        def save_img(self,img_url,img_name):
            r = self.session.get(url=img_url)
            img_dir = '校花图片'
            if not os.path.exists(img_dir):      # create the output directory on first use
                os.mkdir(img_dir)
            img_path = os.path.join(img_dir,img_name)
            with open(img_path,'wb') as f:
                f.write(r.content)
                print('%s downloaded'%img_name)
    
        def run(self):
            for index_url in self.get_index_url():
                for img_url,img_name in self.get_img_name(index_url):
                    self.save_img(img_url,img_name)
    
    
    if __name__ == '__main__':
        xiaohua = spider()
        xiaohua.run()
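
    A minimal, self-contained sketch of the requests_html selector pattern used above; the HTML snippet and names are made up for illustration, not taken from the real page:

    from requests_html import HTML

    doc = HTML(html='<div class="items"><a class="p_title" href="/a.html"><img src="/img/1.jpg"></a></div>')
    item = doc.find('.items', first=True)                    # CSS selector, first match only
    print(item.find('img', first=True).attrs.get('src'))    # -> /img/1.jpg
    print(item.find('a', first=True).attrs.get('href'))     # -> /a.html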

    2. Douban

    from requests_html import HTMLSession
    
    # quick test of the API endpoint
    # session = HTMLSession()
    # url='https://movie.douban.com/tag/#/?sort=S&range=0,10&tags=2018'
    #
    # r = session.get(url=url)
    # print(r.text)
    
    # '电影' percent-encodes to %E7%94%B5%E5%BD%B1
    
    # print(str('电影'.encode('utf-8')).strip("'b").replace('\\x','%').upper())
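    # A simpler way to get the same percent-encoding (standard library only;
    # this is an added illustration, not part of the original script):
    # from urllib.parse import quote
    # print(quote('电影'))    # -> '%E7%94%B5%E5%BD%B1'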
    
    
    class spider():
        def __init__(self):
            self.api = 'https://movie.douban.com/j/new_search_subjects?'
            self.session = HTMLSession()
    
        def get_params(self):
            year_range = input('Enter a year range: ')    # the year range is an interval, comma separated, e.g. 2018,2019
            sort = input('Enter the sort rule (S = by rating): ')
    
            self.params = {
                'year_range':year_range,
                'sort':sort,
                'start':0
            }
    
        def get_data(self):
            for i in range(10):                 # fetch the first 10 pages
                self.params['start'] = i*20     # the API pages results in steps of 20
                r = self.session.get(url=self.api,params=self.params)
                print(r.json())
    
        def run(self):
            self.get_params()
            self.get_data()
    
    if __name__ == '__main__':
        douban = spider()
        douban.run()
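
    requests builds the query string from the params dict itself; a small illustration of what the final URL looks like (values made up):

    from urllib.parse import urlencode

    params = {'year_range': '2018,2019', 'sort': 'S', 'start': 0}
    print('https://movie.douban.com/j/new_search_subjects?' + urlencode(params))
    # -> https://movie.douban.com/j/new_search_subjects?year_range=2018%2C2019&sort=S&start=0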

    3. Xiaohuar video scraper

    from requests_html import HTMLSession
    import os
    
    class spider():
    
        def __init__(self):
            self.session = HTMLSession()
    
    
        def get_index_page(self):
            for i in range(7):
                url = 'http://www.xiaohuar.com/list-3-%s.html'%i
                yield url
    
        def parse_index_page(self,index_page):
            r = self.session.get(url=index_page)
            elements_list = r.html.find('#images .items a[class="imglink"]')
            for element in elements_list:
                yield element.attrs.get('href')
    
        def parse_detail_page(self,detail_page):
            r = self.session.get(url=detail_page)
            r.html.encoding = 'GBK'
            result_obj = r.html.search('var vHLSurl    = "{}";')     # the m3u8 playlist URL is embedded in the page's inline JS
            if result_obj:
                m3u8_url = result_obj[0]
                m3u8_name = r.html.find('title',first=True).text.replace('\\','')   # strip backslashes so the title works as a directory name
                yield m3u8_url,m3u8_name
            else:
                print('no match - this page has no video resource')
    
        def save_m3u8(self,m3u8_url,m3u8_name):
            m3u8_dir = m3u8_name
            if not os.path.exists(m3u8_dir):
                os.mkdir(m3u8_dir)
            print(m3u8_url)
            r = self.session.get(url=m3u8_url)
            m3u8_path = os.path.join(m3u8_dir,'playlist.m3u8')
            with open(m3u8_path,'wt+',encoding='utf-8') as f:
                f.write(r.text)      # save the playlist, then rewind and read it back line by line
                f.seek(0,0)
                for line in f:
                    line = line.strip()
                    if line.endswith('.ts'):        # each .ts line is one video segment
                        ts_url = os.path.dirname(m3u8_url) + '/%s'%line    # segment URLs are relative to the playlist URL
                        r = self.session.get(url=ts_url)
                        ts_path = os.path.join(m3u8_dir,line)
                        with open(ts_path,'wb') as f1:
                            f1.write(r.content)
                            print('%s downloaded'%line)
    
        def run(self):
            for url in self.get_index_page():
                for detail_page in  self.parse_index_page(url):
                    for m3u8_url,m3u8_name in self.parse_detail_page(detail_page):
                        self.save_m3u8(m3u8_url,m3u8_name)
    
    
    if __name__ == '__main__':
        xiaohua = spider()
        xiaohua.run()
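
    The script above only downloads playlist.m3u8 and the individual .ts segments. A minimal follow-up sketch for stitching the segments into a single file, assuming they were saved into m3u8_dir as above (the function and output file names are illustrative, not part of the original):

    import os

    def merge_ts(m3u8_dir, out_name='video.ts'):
        # read playlist.m3u8 to keep the segments in playback order,
        # then append each segment's bytes to one output file
        playlist = os.path.join(m3u8_dir, 'playlist.m3u8')
        out_path = os.path.join(m3u8_dir, out_name)
        with open(playlist, encoding='utf-8') as f, open(out_path, 'wb') as out:
            for line in f:
                line = line.strip()
                if line.endswith('.ts'):
                    with open(os.path.join(m3u8_dir, line), 'rb') as seg:
                        out.write(seg.read())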

    4. Tmall

    from requests_html import HTMLSession
    
    class spider():
    
        def __init__(self):
            self.session = HTMLSession()
            self.api= 'http://list.tmall.com/search_product.htm?'
    
        def get_params(self):
            pro = input("Enter the product to scrape: ")
            self.params = {
                'q':pro,
                'totalPage':1,
                'jumpto':1
            }
    
        def get_totalPage(self):
            r = self.session.get(url=self.api,params=self.params)
            totalPage = r.html.find('[name="totalPage"]',first=True).attrs.get('value')   # the total page count sits in a hidden input on the results page
            self.params['totalPage'] = int(totalPage)
    
        def get_pro_info(self):
            for i in range(1,self.params['totalPage']+1):
                self.params['jumpto'] = i        # jumpto selects the results page
                r = self.session.get(url=self.api, params=self.params)
                elements_pro_list = r.html.find('.product')
                for element_pro in elements_pro_list:
                    title = element_pro.find('.productTitle a',first=True).text
                    price = element_pro.find('.productPrice em',first=True).attrs.get('title')
                    print(title)
                    print(price)
                    print('-'*30)
    
        def run(self):
            self.get_params()
            self.get_totalPage()
            self.get_pro_info()
    
    if __name__ == '__main__':
        tmall = spider()
        tmall.run()
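
    To keep the results instead of printing them, a small helper could collect (title, price) pairs and write them out; this is a sketch, not part of the original script, and the names are illustrative:

    import csv

    def save_rows(rows, path='tmall.csv'):
        # rows: an iterable of (title, price) tuples collected in get_pro_info
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['title', 'price'])
            writer.writerows(rows)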

     
