
    Hands-on: crawling gallery images with PyQuery and Requests

    Prerequisites: the requests and PyQuery libraries, MongoDB, and pymongo are already installed.
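
    If any of the Python packages are missing, they can usually be installed with pip (the MongoDB server itself has to be installed and started separately):

    pip install requests pyquery pymongo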

    ## config.py: MongoDB connection settings and the category to crawl
    MONGO_URL = 'localhost'      # MongoDB host
    MONGO_DB = 'uumtu'           # database name
    MONGO_TABLE = 'uumtu'        # collection name
    KEYWORD = 'mxmn'             # category slug used in the index-page URLs
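
    ## main crawler script (it assumes the settings above are saved as config.py in the same directory)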
    import requests
    from requests.exceptions import RequestException
    from pyquery import PyQuery as pq
    import re
    from config import *
    import pymongo
    from hashlib import md5
    import os
    from multiprocessing import Pool
    
    client = pymongo.MongoClient(MONGO_URL, connect=False)  # connect=False defers connecting until after the Pool workers fork
    db = client[MONGO_DB]
    
    baseurl = 'https://www.uumnt.cc'
    
    def get_page_index(offset,keyword):
        """
        这个函数返回指定主题、页码的html内容
        :param offset: 这个主题中的第几页
        :param keyword: 哪一个主题
        :return:
        """
        if offset != 1:
            url = 'https://www.uumnt.cc/%s/%s'%(str(keyword),'list_%s.html'%str(offset))
        else:
            url = 'https://www.uumnt.cc/%s'%(str(keyword),)
    
        print(url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('Error requesting the index page')
    
    
    def parse_page_index(html):
        """
    
        :param html: 传入的html是包含很多组图的入口页面
        :return: 返回url的生成器
        #contbody > div:nth-child(7) > div > h1
        """
        #try:
        doc = pq(html)
        labelAs = doc('#mainbodypul .listmainrows')
        #print(type(labelAs.find('a')))
        for aa in labelAs.find('a'):
            a1=pq(aa)
            yield baseurl  + a1.attr.href
    
    
    def save_pic(content):
        # Name the file after the md5 of its bytes so the same image is never saved twice
        save_dir = os.path.join(os.getcwd(), 'minxin')
        os.makedirs(save_dir, exist_ok=True)   # make sure the download directory exists
        file_path = '{0}/{1}.{2}'.format(save_dir, md5(content).hexdigest(), 'jpg')
        print(file_path)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)
    
    def download_image(url):
        print(url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                html = pq(response.text)
                src = html('div.center a img').attr.src
                print(src)
                print('Downloading image ',url)
                '''
                https://newimg.uumnt.cc:8092/Pics/2017/1112/02/02.jpg
                https://newimg.uumnt.cc:8092/Pics/2017/1120/09/01.jpg
                '''
                # The image host appears to reject hot-linked requests, so send a
                # referer and a browser user-agent along with the download request
                headers = {
                    'referer': 'https://www.uumtu.com/siwa/23573_3.html',
                    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
                }
                image_page = requests.get(src, headers=headers)  # fetch the actual image file
                print(image_page.headers)
                if image_page.status_code == 200:
                    save_pic(image_page.content)
            return None
        except RequestException:
            print('Error requesting the image page')
    
    def get_page_detail(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                doc = pq(response.text)
    
                h1_text = doc('div.center h1.center').text()
                print(h1_text)
                # The h1 text is assumed to end with a "(current/total)" page counter,
                # e.g. "套图标题(1/20)"; adjust the pattern if the page uses another format
                titlePatten = re.compile(r'(.*?)\((\d+)/(\d+)\)', re.S)
                mymatch = re.search(titlePatten, h1_text)
                title = mymatch.group(1)
                count = int(mymatch.group(3))       # total number of pages in the set
                # Page 1 is the entry URL itself; pages 2..count replace ".html" with "_N.html"
                image_urls = [url]
                for i in range(2, count + 1):
                    image_urls.append(url[:-5] + '_' + str(i) + '.html')
                return {
                    'title':title,
                    'count':count,
                    'url':url,
                    'images':image_urls
                }
            return None
        except RequestException:
            print('Error requesting the detail page')
    
    def save_to_mongo(result):
        # insert() was removed in newer versions of pymongo; insert_one() is the current API
        if db[MONGO_TABLE].insert_one(result).acknowledged:
            print('Successfully Saved to Mongo', result)
            return True
        return False
    
    def main(offset):
        html = get_page_index(offset, KEYWORD)
        if not html:
            return
        for a in parse_page_index(html):
            mydic = get_page_detail(a)
            if mydic:
                save_to_mongo(mydic)
                for image_url in mydic['images']:
                    download_image(image_url)

    if __name__ == '__main__':
        # Crawl index pages 1..82 of the category in parallel, one offset per worker
        pool = Pool()
        groups = [x + 1 for x in range(82)]
        print(groups)
        pool.map(main, groups)
        pool.close()
        pool.join()
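
    When troubleshooting a single index page it is easier to skip the process pool and call main() directly. A minimal sketch, assuming the script above is saved as spider.py (the file name is not given in the original post):

    # debug_run.py: hypothetical single-process run for troubleshooting
    from spider import main   # assumes the crawler above was saved as spider.py

    if __name__ == '__main__':
        main(1)   # crawl only the first index page of KEYWORD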
  • Original post: https://www.cnblogs.com/x00479/p/14249025.html