zoukankan      html  css  js  c++  java
  • 练习4-今日头条爬取

    import requests
    from urllib.parse import urlencode
    from pyquery import PyQuery as pq
    import re,os
    from hashlib import md5
    
    def get_page(page_num, search_id, timeout=10):
        """Fetch one page of Toutiao search results for the keyword '街拍'.

        Page 0 is requested with the ``source=input`` parameter set; every
        other page is requested with the pagination parameter set, which
        carries ``page_num`` and ``search_id``.

        Args:
            page_num: Index of the result page to fetch (0 for the first page).
            search_id: Value sent as the ``search_id`` query parameter on
                pagination requests. Not sent when ``page_num`` is 0.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script forever.

        Returns:
            The response body as text when the server answers with status
            200; ``None`` on any other status or when the request fails.
        """
        if page_num == 0:
            params = {
                'dvpf': 'pc',
                'source': 'input',
                'keyword': '街拍',
            }
        else:
            params = {
                'keyword': '街拍',
                'pd': 'synthesis',
                'source': 'pagination',
                'dvpf': 'pc',
                'aid': 4916,
                'page_num': page_num,
                'search_id': search_id,
            }
        url = 'https://so.toutiao.com/search?' + urlencode(params)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print('ERROR1:', e)
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_pg(html):
        """Yield the image URLs found in a search-result page.

        Looks at every ``<img>`` element nested inside an element with the
        CSS class ``abs-fill`` and yields its ``src`` attribute. Each URL is
        also printed as it is produced.

        Args:
            html: Markup of the result page, as accepted by ``PyQuery``.

        Yields:
            The ``src`` value of each matching image. Images with a missing
            or empty ``src`` are skipped so callers never receive ``None``.
        """
        doc = pq(html)
        for img in doc('.abs-fill img').items():
            src = img.attr('src')
            if not src:
                # Nothing downloadable here; yielding None would only fail
                # later when the caller tries to request it.
                continue
            print(src)
            yield src
    
    
    def save_img(img, save_dir=r'D:pycharm_projects街拍', timeout=10):
        """Download one image and store it under ``save_dir``.

        The file is named after the MD5 hex digest of the downloaded bytes
        with a ``.jpg`` extension, so identical content is written only once.

        Args:
            img: URL of the image to download.
            save_dir: Directory the image is written to; created (including
                missing parents) if it does not exist.
                NOTE(review): the default contains no path separators and
                looks like its backslashes were lost when the code was
                pasted (presumably ``D:\\pycharm_projects\\街拍``) - confirm.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            None. Request and file-system errors during the download or
            write are reported with an ``ERROR2:`` message rather than raised.
        """
        # exist_ok avoids the check-then-create race and also creates parents.
        os.makedirs(save_dir, exist_ok=True)
        try:
            response = requests.get(img, timeout=timeout)
            if response.status_code != 200:
                return
            content = response.content
            file_name = '{}.{}'.format(md5(content).hexdigest(), 'jpg')
            file_path = os.path.join(save_dir, file_name)
            if os.path.exists(file_path):
                print('alredy download')
                return
            with open(file_path, 'wb') as f:
                f.write(content)
        except (requests.RequestException, OSError) as e:
            print('ERROR2:', e)
    
    
    def main():
        """Crawl the first two result pages and save every image found.

        The first page is fetched without a search id. The ``search_id``
        needed for the pagination request is then read from the ``href`` of
        the first link inside the last ``.result-content`` element of that
        page. Crawling stops early when a page cannot be fetched, or when no
        ``search_id`` can be extracted for the following page.
        """
        search_id = ''
        for i in range(2):
            html = get_page(i, search_id)
            if html is None:
                print('ERROR3: could not fetch page', i)
                break

            can_paginate = True
            if i == 0:
                href = pq(html)('.result-content:last-child a:first-child').attr('href')
                # [^&]* stops at the next query parameter; the previous
                # greedy (.*) swallowed everything after search_id as well.
                match = re.search(r'search_id=([^&]*)', href or '')
                if match:
                    search_id = match.group(1)
                else:
                    can_paginate = False

            for img in parse_pg(html):
                print(img)
                save_img(img)

            if not can_paginate:
                # Presumably the pagination request is useless without a
                # search id, so stop after the page we already have.
                print('ERROR3: search_id not found, stopping after page', i)
                break
    
    # Run the crawler only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()
    
    
  • 相关阅读:
    selenium自动化测试资源整理
    python获取目录下文件夹名称
    Appium-测试失败后屏幕截图的
    appium 多个设备同时执行
    七 Appium常用方法介绍
    六 APPIUM Android 定位方式
    Python运维开发基础08-文件基础
    Python运维开发基础09-函数基础
    Python运维开发基础06-语法基础
    Python运维开发基础07-文件基础
  • 原文地址:https://www.cnblogs.com/tingshu/p/14773354.html
Copyright © 2011-2022 走看看