  • Scraping cartoon images

    I wanted a site to practice on and this one turned out to be a good fit, so I scraped some of its images, mostly cartoon pictures; the code is below.
    This is still the first version; an upgraded multi-threaded version will follow later (see the thread-pool sketch after the code listing).

    # -*- coding: utf-8 -*-
    # by wangcc
    # mail:wangcc_sd@163.com
    
    import requests
    import sys
    import io
    import os
    from bs4 import BeautifulSoup
    import asyncio
    import json
    
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # change the default encoding of stdout
    
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    
    async def get_url(queue, url):
        # producer: scan one list page and queue each <p class="list_h"> entry once
        print(url)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        for divObj in soup.find_all('p', class_='list_h'):
            await queue.put(divObj)
        await asyncio.sleep(1)
    
    def get_page(url):
        # read the page count out of the "末页" (last page) pagination link,
        # whose href looks like xxx_<page>.html
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        soup_page = soup.find_all('a', text="末页")
        try:
            page = str(soup_page).split('_')[1].split('.')[0]
        except IndexError as e:
            print(e)
            return 0
        return page
    
    def get_url_second(url):
        # walk every page of one album and download its image
        page = get_page(url)
        # pages 2..<page> follow the pattern xxx_2.html ... xxx_<page>.html;
        # the first page keeps the original url
        url_list = [url.split('.html')[0] + '_{}.html'.format(i)
                    for i in range(2, int(page) + 1)]
        url_list.append(url)
        for page_url in url_list:
            response = requests.get(page_url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            # the page embeds its metadata as JSON-LD inside a <script> tag
            script_tag = soup.find('script', type="application/ld+json")
            script_dict = json.loads(script_tag.string)
            title = script_dict["title"]
            images = script_dict["images"][0]
            jpg_name = "./date/" + title + '/' + str(images).split('/')[-1]
            jpg_index = requests.get(images, headers=headers)
            with open(jpg_name, 'wb') as jpg:
                jpg.write(jpg_index.content)
    
    
    
    def dir_save(dir_name):
        # create ./date/<dir_name> (and ./date itself) if missing
        path = './date'
        os.makedirs(path + '/' + dir_name, exist_ok=True)
    
    async def consumer(queue):
        # consumer: take one album element off the queue and download it
        while True:
            print('qsize--->', queue.qsize())
            divObj = await queue.get()
            href = divObj.a.get('href')
            title = divObj.a.get('title')
            dir_save(title)
            url = "https://www.uumtu.com" + href
            get_url_second(url)
            queue.task_done()
    
    
    
    async def main():
        queue = asyncio.Queue()
        producers = []
        for i in range(1, 50):
            # url = 'https://www.uumtu.com/meinv/list_{}.html'.format(i)
            url = 'https://www.uumtu.com/katong/list_{}.html'.format(i)
            print(url)
            producers.append(asyncio.create_task(get_url(queue, url)))
        consumer_task = asyncio.create_task(consumer(queue))
        # wait until every list page is scanned and every queued album handled
        await asyncio.gather(*producers)
        await queue.join()
        consumer_task.cancel()
    
    if __name__ == '__main__':
        asyncio.run(main())
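    One caveat about the script above: requests.get is a blocking call, so the event loop cannot switch to another coroutine while a page is downloading, and the tasks end up running mostly one after another. Below is a minimal sketch of a truly non-blocking fetch using aiohttp; this is my illustration, not part of the original script, and it only assumes the same list-page URL pattern used above.

    import asyncio
    import aiohttp

    headers = {'User-agent': 'Mozilla/5.0'}

    async def fetch(session, url):
        # the coroutine suspends here, letting other tasks run while waiting
        async with session.get(url, headers=headers) as response:
            return await response.text()

    async def demo():
        async with aiohttp.ClientSession() as session:
            urls = ['https://www.uumtu.com/katong/list_{}.html'.format(i)
                    for i in range(1, 5)]
            pages = await asyncio.gather(*(fetch(session, u) for u in urls))
            print([len(p) for p in pages])

    if __name__ == '__main__':
        asyncio.run(demo())

    The same session-based fetch could replace each requests.get call in get_url and get_url_second.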
    
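    As for the multi-threaded upgrade mentioned at the top, one rough sketch of that direction (my illustration, not the author's upgraded version) is to hand the per-image downloads to a thread pool, since they are I/O-bound; the URLs and save paths here are hypothetical placeholders.

    import concurrent.futures
    import requests

    headers = {'User-agent': 'Mozilla/5.0'}

    def download(url, path):
        # blocking download of one image; fine inside a worker thread
        resp = requests.get(url, headers=headers)
        with open(path, 'wb') as f:
            f.write(resp.content)

    if __name__ == '__main__':
        # hypothetical (image url, save path) pairs gathered by the scraper
        jobs = [('https://example.com/a.jpg', './a.jpg'),
                ('https://example.com/b.jpg', './b.jpg')]
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
            for job in jobs:
                pool.submit(download, *job)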
    
    
  • Original post: https://www.cnblogs.com/wangcc7/p/13648900.html