  • Python crawler 005 - scraper examples

    Example 1: Scrape the Maoyan Movies TOP100 list

    #!/usr/bin/env python 
    # -*- coding: utf-8 -*-
    """
    Scrape the Maoyan Movies TOP100 list.
    """
    
    import re
    import json
    from multiprocessing import Pool
    
    import requests
    from requests.exceptions import RequestException
    
    
    def get_page_content(url):
        """
        Fetch the page source for the given URL.
        :param url:
        :return:
        """
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    
    def parse_html(html):
        """
        Parse the page source with a regular expression and yield movie dicts.
        :param html:
        :return:
        """
        # Capture groups: ranking index, poster image URL, title, and the
        # integer and fractional parts of the score.
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?class="name"><a.*?>(.*?)</a>.*?class="score"><i.*?>(.*?)</i><i.*?>(.*?)</i>.*?</dd>',
            re.S)
        items = re.findall(pattern, html)
        for item in items:
            yield {
                'index': item[0],
                'image': item[1],
                'title': item[2],
                'score': item[3] + item[4]
            }
    
    
    def write_to_file(dic):
        """
        Append one record to the result file as a JSON line.
        :param dic:
        :return:
        """
        with open('result', 'a', encoding='utf-8') as f:
            f.write(json.dumps(dic, ensure_ascii=False) + "\n")
    
    
    def main(offset):
        """
        Scrape one board page at the given offset and save each movie.
        :param offset:
        :return:
        """
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_page_content(url)
        for item in parse_html(html):
            print(item)
            write_to_file(item)
    
    
    if __name__ == '__main__':
        # Use multiprocessing.Pool to scrape the ten board pages concurrently (one task per offset).
        pool = Pool(4)
        pool.map(main, [i * 10 for i in range(10)])
        pool.close()
        pool.join()
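
    Note: maoyan.com may serve an anti-crawler verification page to requests that carry no browser headers. If the board pages come back empty, sending a User-Agent usually helps; a minimal sketch of such a variant of get_page_content (the header value is only an example, not part of the original post):

    import requests
    from requests.exceptions import RequestException

    # Example browser User-Agent string; any recent browser UA should work
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    def get_page_content(url):
        """Same as above, but sends browser-like headers with the request."""
        try:
            response = requests.get(url, headers=HEADERS)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None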

    Example 2: Scrape 街拍 (street-snap) images from Toutiao

    #!/usr/bin/env python 
    # -*- coding: utf-8 -*-
    """
    Scrape 街拍 (street-snap) images from toutiao.com.
    """
    import requests
    import os
    from hashlib import md5
    from multiprocessing import Pool
    
    
    def get_page(offset):
        """
        Call the search API and return its JSON result.
        :param offset:
        :return:
        """
        url = 'http://www.toutiao.com/search_content'
        params = {
            'offset': offset,
            'format': 'json',
            'keyword': '街拍',
            'autoload': 'true',
            'count': 20,
            'cur_tab': 1,
            'from': 'search_tab'
        }
        try:
            response = requests.get(url, params=params)
            if response.status_code == 200:
                return response.json()
        except Exception as e:
            print(e)
            return None
    
    
    def get_images(json):
        """
        Parse the JSON response and yield dicts of image title and URL.
        :param json:
        :return:
        """
        if json.get("data"):
            for item in json.get("data"):
                title = item.get("title")
                images = item.get("image_detail")
                if images:
                    for image in images:
                        url = image.get("url")
                        yield {
                            'title': title,
                            'image': url
                        }
    
    
    def download_image(item):
        """
        Download one image into a local folder named after its title.
        :param item:
        :return:
        """
        url = item.get("image")
        title = item.get("title")
        if not os.path.exists(title):
            try:
                os.mkdir(title)
            except Exception as e:
                print(e)
        file_path = os.path.join(title, md5(
            item.get("image").encode("utf-8")).hexdigest() + ".jpg")
        if not os.path.exists(file_path):
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    with open(file_path, 'wb') as fp:
                        fp.write(response.content)
            except Exception as e:
                print('Failed to Save Image')
                print(e)
    
    
    def main(offset):
        """
        Main function for one search-results page.
        :param offset:
        :return:
        """
        # Step 1: fetch the JSON returned by the 街拍 search API
        content = get_page(offset)
        # Step 2: iterate over the title and image URL of every search result
        for item in get_images(content):
            print(item)
            # Step 3: download the image
            download_image(item)
    
    
    if __name__ == '__main__':
        GROUP_START = 1
        GROUP_END = 2
        offsets = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
        # Make the Pool of workers
        pool = Pool(4)
        pool.map(main, offsets)
        # close the pool and wait for the work to finish
        pool.close()
        pool.join()
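
    One practical caveat: the title comes straight from the search result, so it can be empty or contain characters that Windows does not allow in folder names. A hedged helper that download_image could call before os.mkdir (the name safe_dirname is ours, not from the original code):

    import re

    def safe_dirname(title):
        """Turn a result title into a directory name that is safe on Windows."""
        if not title:
            return 'untitled'
        # Strip characters that are illegal in Windows file names
        return re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'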

    Example 3: Scrape all images from Mzitu (mzitu.com)

    #!/usr/bin/python
    # coding=utf-8
    
    """
    Download images from mzitu.com to local disk.
    """
    import os
    import requests
    from pyquery import PyQuery as pq
    from multiprocessing import Pool
    import time
    
    
    class MeiZi:
        def __init__(self):
            self.index_url = "http://www.mzitu.com/"
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
                'Referer': self.index_url
            }
    
        def get_document(self, url):
            """
            Fetch the given URL and parse its DOM with pyquery.
            :param url:
            :return:
            """
            try:
                # Send the browser-like headers defined in __init__
                response = requests.get(url, headers=self.headers)
                if response.status_code == 200:
                    return pq(response.text)
                else:
                    print("[页面访问失败]:status_code:%s,url:%s" % (
                        response.status_code, url))
            except Exception as e:
                print(e)
    
        def index_page_range(self):
            """
            Return a range over the index page numbers (1 to the last page).
            :return:
            """
            doc = self.get_document(self.index_url)
            if doc:
                # The second-to-last pagination link holds the last page number
                page_count = doc('.nav-links a:nth-last-child(2)').text()
                if page_count:
                    # Index pages are numbered from 1, so start the range at 1
                    return range(1, int(page_count) + 1)
    
        def get_items(self, page):
            """
            Yield each gallery's URL and title from one index (level-1) page.
            :param page:
            :return:
            """
            doc = self.get_document(self.index_url + "page/" + str(page))
            if doc:
                items = doc('#pins li span a').items()
                if items:
                    for item in items:
                        yield {
                            'item_url': item.attr('href'),
                            'item_title': item.text()
                        }
    
        def save_item_details(self, item):
            """
            Save every image in one gallery (level-2) page.
            :param item:
            :return:
            """
            # Fetch and parse the gallery page
            item_url = item.get("item_url")
            item_title = item.get("item_title")
            doc = self.get_document(item_url)
            if doc:
                # Total number of pages inside the gallery
                page_count = doc('.pagenavi a:nth-last-child(2)').text()
                # Image URL on the current gallery page
                img_src = doc('.main-image img').attr('src')
                # Save the image on the first page
                self.save_image(item_title, img_src)
                # If there are more pages, fetch and save the image on each of them
                page = 1
                while page < int(page_count):
                    page += 1
                    detail_url = item_url + "/" + str(page)
                    doc = self.get_document(detail_url)
                    if doc:
                        img_src = doc('.main-image img').attr('src')
                        self.save_image(item_title, img_src)
    
        def save_image(self, title, img_src):
            """
            Save img_src to local disk, in a folder named after title.
            :param title:
            :param img_src:
            :return:
            """
            root = "F:\meizitu"
            # 保存到的文件夾,如果不存在,則創建
            save_dir = os.path.join(root, title)
            if not os.path.exists(save_dir):
                try:
                    os.makedirs(save_dir)
                except Exception as e:
                    print("[創建文件夾失敗]:%s" % save_dir)
                    print(e)
            # Download and save the image if it is not already on disk
            pic_name = os.path.basename(img_src)
            pic_path = os.path.join(save_dir, pic_name)
            if not os.path.exists(pic_path):
                try:
                    response = requests.get(img_src, headers=self.headers)
                    if response.status_code == 200:
                        with open(pic_path, 'wb') as f:
                            f.write(response.content)
                            print(pic_path)
                    else:
                        print("[图片访问失败]status_code:%s,url:%s" % (
                            response.status_code, img_src))
                except Exception as e:
                    print(e)
    
        def main(self, page):
            # Galleries listed on this index page
            items = self.get_items(page)
            # For each gallery, fetch its detail pages and save the images
            for item in items:
                self.save_item_details(item)
    
    
    if __name__ == '__main__':
        start_time = time.time()
        mz = MeiZi()
        pool = Pool(10)
        page_count = mz.index_page_range()
        pool.map(mz.main, page_count)
        pool.close()
        pool.join()
        print("times:", time.time() - start_time)

    Example 4: Scrape the image data package of a 1688 product listing

    #!/usr/bin/python
    # coding=utf-8
    
    """
    Download the images of a 1688 product listing to local disk.
    """
    import os
    import requests
    from pyquery import PyQuery as pq
    import json
    import re
    
    
    def save_image(root, pic_name, img_src):
        """ 保存图片到本地
        :param root: 保存位置文件夹
        :param pic_name:保存图片名称,如a.jpg
        :param img_src:图片源地址
        :return:
        """
    
        # Destination folder; create it if it does not exist
        if not os.path.exists(root):
            try:
                os.makedirs(root)
            except Exception as e:
                print("[創建文件夾失敗]:%s" % root)
                print(e)
        # Download and save the image if it is not already on disk
        pic_path = os.path.join(root, pic_name)
        if not os.path.exists(pic_path):
            try:
                response = requests.get(img_src)
                if response.status_code == 200:
                    with open(pic_path, 'wb') as f:
                        f.write(response.content)
                        print("下载完成:", pic_path)
                else:
                    print("[图片访问失败]status_code:%s,url:%s" % (
                        response.status_code, img_src))
            except Exception as e:
                print(e)
    
    
    def main(root, detail_url, cookie):
        """ 下载数据包主函数"""
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, br',
            'upgrade-insecure-requests': '1',
            # the logged-in user's cookie, copied from the browser
            'cookie': cookie,
        }
        resp = requests.get(url=detail_url, headers=header)
        doc = pq(resp.text)
        # Save the main (gallery) images
        images_show = doc('.content li.tab-trigger')
        if images_show:
            loop = 1
            for image in images_show:
                src = json.loads(image.get('data-imgs')).get('original')
                pic_type = os.path.splitext(src)[1]
                save_image(root, '主图_%s%s' % (loop, pic_type), src)
                loop += 1
        # Save the detail-description images:
        images_detail = doc('#desc-lazyload-container')
        detail = images_detail.attr('data-tfs-url')
        resp = requests.get(url=detail)
        if resp.status_code == 200:
            src_match = re.findall(r'<img.*?(https://.*?\.(jpg|png|jpeg|gif))',
                                   resp.text)
            loop = 1
            for src in src_match:
                save_image(root, '详情_%s.%s' % (loop, src[1]), src[0])
                loop += 1
    
    
    if __name__ == '__main__':
        # Folder to save the images to; use a different folder for each run so files are not overwritten
        root = r'F:\ShopData\777'
        # Exact URL of the 1688 product detail page
        detail_url = 'https://detail.1688.com/offer/36413052665.html?spm=b26110380.sw1688.mof001.34.xEXARY'
        # Log in to 1688 manually in a browser, then paste the browser's cookie here
        cookie = 'XXXXXXXXXXXX'
        # Start scraping...
        main(root, detail_url, cookie)
        print("数据包下载完成!")
  • Original article: https://www.cnblogs.com/guanfuchang/p/8145474.html