  • python + BeautifulSoup + multiprocessing: scraping images from Qiushibaike (糗事百科)

    Libraries used:

    import requests
    import os
    from bs4 import BeautifulSoup
    import time
    from multiprocessing import Pool

    Define the image storage path:

    path = 'E:\\爬虫805\\'   # a raw string cannot end with a single backslash, so escape the backslashes instead
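
    If the folder does not exist yet, a small safeguard (not in the original script) can create it before any download runs; a minimal sketch, reusing the same path variable:

    import os
    os.makedirs(path, exist_ok=True)   # create E:\爬虫805\ if it is missing; do nothing if it already exists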

    Request headers, to mimic a browser request:

    To find them in the browser, press F12 to open the developer tools and copy the User-Agent from any request:

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
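
    Before writing the rest of the scraper, it can help to confirm that this User-Agent gets a normal response. A minimal sketch, assuming the first listing page (the same URL pattern used later) as a test URL:

    import requests

    test_url = 'https://www.qiushibaike.com/imgrank/page/1/'   # first page of the image ranking
    res = requests.get(test_url, headers=headers)
    print(res.status_code)                                     # expect 200 if the request is accepted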

    Main function:

    def get_images(url):
        data = 'https:'                                   # img src values are protocol-relative ('//...'), so prepend the scheme
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'lxml')
        url_infos = soup.select('div.thumb > a > img')    # every image thumbnail on the page
        # print(url_infos)
        for url_info in url_infos:
            try:
                urls = data + url_info.get('src')
                if os.path.exists(path + urls.split('/')[-1]):    # skip images already on disk
                    print('Image already downloaded')
                else:
                    image = requests.get(urls, headers=headers)
                    with open(path + urls.split('/')[-1], 'wb') as fp:
                        fp.write(image.content)
                    print('Downloading: ' + urls)
                    time.sleep(0.5)                       # small pause to avoid hammering the server
            except Exception as e:
                print(e)
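
    Before adding multiprocessing, the function can be sanity-checked on a single page. A hedged one-off call, using the same URL pattern as the route list below:

    get_images('https://www.qiushibaike.com/imgrank/page/1/')   # download images from page 1 only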

    Start the scraper:

    if __name__ == '__main__':
        # list of page URLs to scrape (pages 1-13 of the image ranking)
        urls = ['https://www.qiushibaike.com/imgrank/page/{}/'.format(i) for i in range(1, 14)]
        # scrape the pages with a multiprocessing pool
        pool = Pool()
        pool.map(get_images, urls)
        print('Scraping finished')
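
    Pool() with no arguments starts one worker process per CPU core. If that is too aggressive for the site, a capped pool plus an explicit close/join is a common variation; a sketch, not part of the original post:

    pool = Pool(processes=4)      # limit the pool to 4 worker processes
    pool.map(get_images, urls)    # each worker handles one page URL at a time; map blocks until all pages are done
    pool.close()                  # no more tasks will be submitted
    pool.join()                   # wait for the workers to exit cleanly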

    Scraping in progress:

    Open the folder to check the downloaded images:
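
    Instead of counting files by hand, a short snippet (a convenience, not from the original post) can report how many images were saved:

    import os

    files = os.listdir(path)                            # everything saved under E:\爬虫805\
    print('downloaded {} images'.format(len(files)))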

    done

    Full code:

    import requests
    import os
    from bs4 import BeautifulSoup
    import time
    from multiprocessing import Pool
    """
    ************常用爬虫库***********
           requests
           BeautifulSoup
           pyquery 
           lxml
    ************爬虫框架***********
           scrapy
           三大解析方式:re,css,xpath
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    path = 'E:\\爬虫805\\'   # escape the backslashes; a raw string cannot end with a single backslash
    def get_images(url):
        data = 'https:'                                   # img src values are protocol-relative ('//...'), so prepend the scheme
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'lxml')
        url_infos = soup.select('div.thumb > a > img')    # every image thumbnail on the page
        # print(url_infos)
        for url_info in url_infos:
            try:
                urls = data + url_info.get('src')
                if os.path.exists(path + urls.split('/')[-1]):    # skip images already on disk
                    print('Image already downloaded')
                else:
                    image = requests.get(urls, headers=headers)
                    with open(path + urls.split('/')[-1], 'wb') as fp:
                        fp.write(image.content)
                    print('Downloading: ' + urls)
                    time.sleep(0.5)                       # small pause to avoid hammering the server
            except Exception as e:
                print(e)
    
    if __name__ == '__main__':
        # list of page URLs to scrape (pages 1-13 of the image ranking)
        urls = ['https://www.qiushibaike.com/imgrank/page/{}/'.format(i) for i in range(1, 14)]
        # scrape the pages with a multiprocessing pool
        pool = Pool()
        pool.map(get_images, urls)
        print('Scraping finished')
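
    The docstring above mentions three parsing approaches: re, css and xpath. As a rough sketch of how the same <img> extraction could look with each — assuming the page keeps the div.thumb > a > img structure and that lxml is installed:

    import re
    import requests
    from bs4 import BeautifulSoup
    from lxml import etree

    html = requests.get('https://www.qiushibaike.com/imgrank/page/1/', headers=headers).text

    # 1. re: crude regex for protocol-relative src attributes (assumes src is the first attribute of the tag)
    srcs_re = re.findall(r'<img src="(//[^"]+)"', html)

    # 2. css: the same selector get_images() uses
    srcs_css = [img.get('src') for img in BeautifulSoup(html, 'lxml').select('div.thumb > a > img')]

    # 3. xpath: the same structure expressed as an XPath query
    srcs_xpath = etree.HTML(html).xpath('//div[@class="thumb"]/a/img/@src')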
  • Original post: https://www.cnblogs.com/nmsghgnv/p/11306815.html