zoukankan      html  css  js  c++  java
  • 福利爬虫妹子图之获取种子url

    import os
    import uuid
    from lxml import html
    import aiofiles
    import logging
    from ruia import Spider, Request
    from ruia_ua import middleware
    from aiohttp探究.db import MotorBase
    import datetime
    
    # URL template for mzitu.com list pages; "{}" is filled with the page number.
    demo = "https://www.mzitu.com/page/{}/"
    
    
    class BaiduImgSpider(Spider):
        """Crawl mzitu.com list pages, follow every gallery link, and store one
        MongoDB record per image page so a downloader can fetch them later.

        ``save_img`` is the (currently unused, see commented-out Requests)
        callback that writes downloaded image bytes to disk.
        """
        start_urls = []
        img_path = 'data/'  # base directory used by save_img

        async def parse(self, res):
            """Parse a list page and schedule one request per gallery.

            :param res: ruia text response for a start URL (a list page)
            """
            # NOTE(review): MotorBase caches db handles, so building it per
            # response is cheap; still, this rebinds self.mongo_db repeatedly.
            self.mongo_db = MotorBase().get_db('img_data')
            root = html.fromstring(res.html)
            url_list = root.xpath("//ul[@id='pins']/li/a/@href")
            name_list = root.xpath("//ul[@id='pins']/li/a/img/@alt")
            headers = {
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'max-age=0',
                'referer': 'https://www.mzitu.com/mm/',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            }
            # FIX: original copied url_list into next_page_urls element by
            # element for no reason; zip over the xpath results directly.
            for name, url in zip(name_list, url_list):
                yield Request(url, headers=headers, callback=self.next_page,
                              metadata={"name": name}, res_type='text')

        async def next_page(self, res):
            """Parse a gallery page and bulk-insert one download record per
            image into MongoDB.

            mzitu galleries number their image files 01..NN, so the first
            image URL is rewritten for each page number instead of fetching
            every gallery page.
            """
            root = html.fromstring(res.html)
            name = res.metadata.get("name")
            refere_url = res.url
            # xpath for the last-page entry of the pager
            max_page_list = "//div[@class='pagenavi']/a[last()-1]/span/text()"
            _max_page_num = root.xpath(max_page_list)
            max_page_num = _max_page_num[0] if _max_page_num else None
            img_url_node = root.xpath("//div[@class='main-image']/p/a/img/@src")
            img_url = img_url_node[0] if img_url_node else None
            if img_url is None:
                # FIX: original crashed with AttributeError on galleries whose
                # main image xpath matched nothing.
                logging.warning('no image found on %s', refere_url)
                return
            base_headers = {
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'max-age=0',
                'if-modified-since': 'Thu, 15 Nov 2018 04:24:11 GMT',
                'if-none-match': '"5becf4eb-1b7d4"',
                'referer': refere_url,
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            }
            datas = [{'url': img_url, "status": "0", 'title': name, "img_id": "1",
                      "headers": dict(base_headers),
                      "crawler_date": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]
            # FIX: original did int(max_page_num) unguarded and crashed with
            # TypeError when the pager was missing; treat that as a one-page gallery.
            last_page = int(max_page_num) if max_page_num else 1
            for page in range(2, last_page + 1):
                # FIX: original mutated ONE shared headers dict, so every record
                # already appended to `datas` ended up referencing the LAST
                # page's referer once inserted into Mongo. Copy per record.
                page_headers = dict(base_headers)
                page_headers["referer"] = f"{refere_url}{str(page).zfill(2)}"
                next_img_url = img_url.replace("01.", f"{str(page).zfill(2)}.")
                datas.append({'url': next_img_url, "status": "0", 'title': name,
                              "img_id": page, "headers": page_headers,
                              "crawler_date": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')})
            await self.mongo_db.mzitu2.insert_many(datas)

        async def save_img(self, res):
            """Write downloaded image bytes to <img_path>/<name>/<img_id>.<ext>."""
            url = res.metadata.get("url")
            # FIX: rsplit always returns at least one element, so the original
            # `_img_type[1] if _img_type else None` raised IndexError for URLs
            # without a dot; check the split length instead.
            parts = url.rsplit(".", 1)
            img_type = parts[1] if len(parts) == 2 else None
            name = res.metadata.get("name")
            img_id = res.metadata.get("id")
            img_all_path = f"{self.img_path}{name}/"
            # FIX: exists()+makedirs() was racy between concurrent callbacks.
            os.makedirs(img_all_path, exist_ok=True)
            img_name = f"{img_id}.{img_type}"
            async with aiofiles.open(img_all_path + img_name, 'wb') as fp:
                await fp.write(res.html)
                logging.info('Img downloaded successfully in {dir}'.format(dir=img_all_path + img_name))
    
    
    if __name__ == '__main__':
        word = '妹子图'  # directory name for downloaded images
        pages = 201  # upper bound on list pages
        BaiduImgSpider.img_path = word + "/"
        # FIX: original used range(pages), which generated the invalid page-0
        # URL (https://www.mzitu.com/page/0/); site pagination starts at 1.
        BaiduImgSpider.start_urls = [demo.format(page) for page in range(1, pages)]
        BaiduImgSpider.start(middleware=middleware)
    

    db.py

    import asyncio
    
    from motor.motor_asyncio import AsyncIOMotorClient
    
    
    class MotorBase:
        """Small cache around motor's async MongoDB client.

        About motor's doc: https://github.com/mongodb/motor
        """
        # Class-level caches shared by every MotorBase instance, keyed by db name.
        _db = {}
        _collection = {}

        def __init__(self, loop=None):
            """Remember the event loop (default: the current asyncio loop)."""
            self.motor_uri = ''
            self.loop = loop or asyncio.get_event_loop()

        def client(self, db):
            """Build a fresh AsyncIOMotorClient pointed at localhost for *db*."""
            self.motor_uri = f"mongodb://localhost:27017/{db}"
            return AsyncIOMotorClient(self.motor_uri, io_loop=self.loop)

        def get_db(self, db='test'):
            """
            Get a db instance
            :param db: database name
            :return: the motor db instance
            """
            try:
                return self._db[db]
            except KeyError:
                # First request for this database: create and cache the handle.
                instance = self.client(db)[db]
                self._db[db] = instance
                return instance
    
    
  • 相关阅读:
    使用maven管理后,依然找不到需要的jar包
    ftp linux-500 OOPS问题解决-jooyong-ChinaUnix博客
    Linux命令 理解
    c++大数模板
    二分匹配专辑
    fzu2157(树形dp)
    fzu2158
    csu1356 :判断一个环是否为奇数环
    rmq模板
    zoj 3761(并查集+搜索)
  • 原文地址:https://www.cnblogs.com/c-x-a/p/10014425.html
Copyright © 2011-2022 走看看