zoukankan      html  css  js  c++  java
  • 实战:requests和pyquery爬取美女图片

    实战:使用PyQuery和Requests爬取美女图片

    前提:已安装requests库、PyQuery、MongoDB、pymongo

    ## config.py 文件(以下四行配置需单独保存为 config.py,主脚本通过 from config import * 导入)
    # MongoDB host, plus the database and collection that receive the results.
    MONGO_URL = 'localhost'
    MONGO_DB = 'uumtu'
    MONGO_TABLE = 'uumtu'
    # Category path segment of the site that the crawler walks through.
    KEYWORD = 'mxmn'
    import requests
    from requests.exceptions import RequestException
    from pyquery import PyQuery as pq
    import re
    from config import *
    import pymongo
    from hashlib import md5
    import os
    from multiprocessing import Pool
    
    # Site root; prepended to the gallery links found on the listing pages.
    baseurl = 'https://www.uumnt.cc'

    # Shared MongoDB handle used by save_to_mongo().
    # NOTE(review): connect=False presumably defers the actual connection until
    # first use, so each pool worker connects on its own - confirm against the
    # PyMongo documentation.
    client = pymongo.MongoClient(MONGO_URL, connect=False)
    db = client[MONGO_DB]
    
    def get_page_index(offset, keyword, timeout=10):
        """
        Fetch the HTML of one listing page of a category.

        Page 1 lives at ``<site>/<keyword>``; every other page lives at
        ``<site>/<keyword>/list_<offset>.html``.

        :param offset: page number inside the category (1 is the first page)
        :param keyword: category path segment of the site
        :param timeout: seconds to wait for the server; without a timeout
            ``requests.get`` may block a worker indefinitely
        :return: the page body as text, or None when the request fails or the
            server answers with a status other than 200
        """
        if offset != 1:
            url = 'https://www.uumnt.cc/%s/list_%s.html' % (keyword, offset)
        else:
            url = 'https://www.uumnt.cc/%s' % (keyword,)

        print(url)
        try:
            response = requests.get(url, timeout=timeout)
        except RequestException:
            print('请求页面出错')
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    def parse_page_index(html):
        """
        Yield the absolute URL of every gallery linked from a listing page.

        :param html: HTML text of a listing page that links to many galleries;
            None or an empty string yields nothing
        :return: generator of gallery URLs (``baseurl`` + the anchor's href)
        """
        # A failed index fetch hands us None; pq() cannot parse that.
        if not html:
            return
        doc = pq(html)
        for anchor in doc('#mainbodypul .listmainrows').find('a'):
            href = pq(anchor).attr.href
            # Anchors without an href would make the concatenation fail.
            # NOTE(review): assumes hrefs are site-relative paths - confirm.
            if href:
                yield baseurl + href
    
    
    def save_pic(content):
        """
        Store image bytes as ``<cwd>/minxin/<md5 of content>.jpg``.

        The file name is the MD5 hex digest of the content, so identical images
        map to the same path and are written only once.

        :param content: raw image bytes
        :return: None
        """
        target_dir = os.getcwd() + "/minxin"
        # The output directory is not guaranteed to exist on a fresh run.
        os.makedirs(target_dir, exist_ok=True)
        file_path = '{0}/{1}.{2}'.format(target_dir, md5(content).hexdigest(), 'jpg')
        print(file_path)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)
    
    def download_image(url, timeout=10):
        """
        Open one image page, locate the picture on it and save it to disk.

        :param url: URL of the HTML page that embeds a single picture
        :param timeout: seconds to wait for each HTTP request; without a
            timeout a stalled server would block the worker indefinitely
        :return: None (the picture is written through ``save_pic``)
        """
        print(url)
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                return None

            src = pq(response.text)('div.center a img').attr.src
            print(src)
            if not src:
                # The page carries no picture; there is nothing to request.
                return None

            print('Downloading image ', url)
            # Example picture URLs:
            #   https://newimg.uumnt.cc:8092/Pics/2017/1112/02/02.jpg
            #   https://newimg.uumnt.cc:8092/Pics/2017/1120/09/01.jpg
            # NOTE(review): the referer is a fixed page on another domain;
            # presumably it is there to satisfy hot-link protection - confirm.
            headers = {
                'referer': 'https://www.uumtu.com/siwa/23573_3.html',
                'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
            }
            # Fetch the actual picture bytes.
            image_page = requests.get(src, headers=headers, timeout=timeout)
            print(image_page.headers)
            if image_page.status_code == 200:
                save_pic(image_page.content)
            return None
        except RequestException:
            print('请求页面出错')
            return None
    
    def get_page_detail(url, timeout=10):
        """
        Read a gallery page and derive its title, size and image-page URLs.

        The page heading is expected to look like ``<title>(<n>/<count>)``; the
        text before the parenthesis becomes the title and the number after the
        slash becomes the count.

        :param url: URL of the gallery's first page (ends in ``.html``)
        :param timeout: seconds to wait for the server before giving up
        :return: dict with keys ``title``, ``count``, ``url`` and ``images``,
            or None when the request fails, the status is not 200, or the
            heading does not have the expected shape
        """
        try:
            response = requests.get(url, timeout=timeout)
        except RequestException:
            print('请求页面出错')
            return None
        if response.status_code != 200:
            return None

        doc = pq(response.text)
        h1_text = doc('div.center h1.center').text()
        print(h1_text)

        # The parentheses and \d must be escaped: unescaped, the pattern
        # captures an empty title and text that int() cannot parse.
        mymatch = re.search(r'(.*?)\(.*?/(\d+)\)', h1_text, re.S)
        if mymatch is None:
            # Heading missing or in an unexpected format: skip this gallery.
            return None
        title = mymatch.group(1)
        count = int(mymatch.group(2))

        # Drop the trailing '.html' and append '_<i>.html' for each page.
        # NOTE(review): range(1, count) yields _1 .. _<count-1>; verify against
        # the site whether the last page (_<count>) should be included as well.
        image_urls = [url[:-5] + '_' + str(i) + '.html' for i in range(1, count)]
        return {
            'title': title,
            'count': count,
            'url': url,
            'images': image_urls
        }
    
    def save_to_mongo(result):
        """
        Insert one gallery record into the ``MONGO_TABLE`` collection.

        Uses ``insert_one`` because ``Collection.insert`` is deprecated and no
        longer exists in PyMongo 4.

        :param result: dict describing a gallery
        :return: True when the server acknowledged the write, otherwise False
        """
        outcome = db[MONGO_TABLE].insert_one(result)
        if outcome.acknowledged:
            print('Successfully Saved to Mongo', result)
            return True
        return False
    
    def main(offset):
        """
        Crawl one listing page: store every gallery on it and download its images.

        :param offset: number of the listing page to process
        :return: None
        """
        html = get_page_index(offset, KEYWORD)
        if html is None:
            # The listing page could not be fetched; nothing to parse.
            return
        for gallery_url in parse_page_index(html):
            detail = get_page_detail(gallery_url)
            if not detail:
                continue
            save_to_mongo(detail)
            for image_page_url in detail['images']:
                download_image(image_page_url)
    if __name__ == '__main__':
        # Fan the listing pages out over a process pool, one task per page.
        pool = Pool()
        # Listing pages 1 through 82.
        groups = list(range(1, 83))
        print(groups)
        pool.map(main, groups)
        # Stop accepting work, then wait for the workers to exit.
        pool.close()
        pool.join()
  • 相关阅读:
    标准JSF的生命周期
    JSON
    Applet
    关于AJAX
    Java EE第十一周
    Java EE第八周
    Java EE第七周
    Java EE第六周
    Java EE第五周
    Java EE第四周
  • 原文地址:https://www.cnblogs.com/x00479/p/14249025.html
Copyright © 2011-2022 走看看