# 1. Chinaz image scraper
# Scrape all classical-beauty images from 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool
# Build the full set of listing-page URLs: page 1 has no numeric suffix,
# pages 2-6 follow the "..._<n>.html" pattern.
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [url] + [
    f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html' for i in range(2, 7)
]

# Minimal request headers; a browser User-Agent avoids trivial bot blocking.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
}

# Image URLs collected by get_pig_url across worker threads.
pig_url_list = []
def get_pig_url(url):
    """Fetch one listing page and append every thumbnail image URL to pig_url_list.

    The site lazy-loads its images, so the real URL lives in each <img>'s
    ``src2`` attribute rather than ``src``.
    """
    response = requests.get(url=url, headers=headers)
    response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    # Parse the HTML and select every card div under the image container.
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # Guard against divs without a lazy-loaded image (ads/placeholders):
        # the original indexed [0] unconditionally and could raise IndexError.
        src_list = div.xpath('.//img/@src2')
        if src_list:
            pig_url_list.append(src_list[0])
def download(url):
    """Fetch a single image URL and return its raw bytes."""
    resp = requests.get(url=url, headers=headers)
    return resp.content
def save_pig(data):
    """Write one image's bytes to zhanzhangpig/<uuid>.jpg.

    The original drew a random number for the filename, which can collide
    across worker threads and silently overwrite files (its own comment
    flagged this); uuid4 hex names are collision-safe.
    """
    import uuid  # local import keeps this fix self-contained

    # Safe even if another thread (or the main script) created the dir first.
    os.makedirs('zhanzhangpig', exist_ok=True)
    path = 'zhanzhangpig/' + uuid.uuid4().hex + '.jpg'
    with open(path, 'wb') as f:
        f.write(data)
# Ensure the output directory exists before any worker writes into it.
if not os.path.exists('zhanzhangpig'):
    os.makedirs('zhanzhangpig')

print('多线程爬取开始')
t0 = time.time()

# Thread pool (multiprocessing.dummy) — a good fit for this I/O-bound workload.
pool = Pool(8)
pool.map(get_pig_url, page_url_list)          # stage 1: collect image URLs from every page
data_list = pool.map(download, pig_url_list)  # stage 2: fetch the image bytes
pool.map(save_pig, data_list)                 # stage 3: write the files

elapsed = time.time() - t0
print('多线程爬取结束')
print('耗时:', elapsed)
# Shut the pool down cleanly once all three stages have completed.
pool.close()
pool.join()


# 2. Mzitu image scraper (https://www.mzitu.com/tag/ugirls/)
import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool
# One Session reuses the TCP connection and carries cookies between requests.
session = requests.session()

# Output directory for downloaded images.
if not os.path.exists('meizitu'):
    os.makedirs('meizitu')

# Listing pages: page 1 is the tag root, pages 2-16 live under /page/<i>/.
url = 'https://www.mzitu.com/tag/ugirls/'
page_url_list = [url] + [
    f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)
]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    # Anti-scraping measure: the site rejects requests without a Referer.
    'Referer': 'https://www.mzitu.com/tag/ugirls/',
}

# Image URLs collected by get_pig_url across worker threads.
pig_url_list = []
def get_pig_url(url):
    """Fetch one listing page and append every thumbnail image URL to pig_url_list.

    Thumbnails are lazy-loaded: the real image URL is in each <img>'s
    ``data-original`` attribute, not in ``src``.
    """
    response = session.get(url=url, headers=headers)
    response.raise_for_status()  # surface HTTP errors (e.g. anti-bot 403) immediately
    # Parse the HTML and walk every gallery item under the #pins list.
    tree = etree.HTML(response.content.decode())
    for item in tree.xpath('//ul[@id="pins"]/li'):
        # Guard against <li> entries without a lazy-loaded image:
        # the original indexed [0] unconditionally and could raise IndexError.
        srcs = item.xpath('.//img/@data-original')
        if srcs:
            pig_url_list.append(srcs[0])
def download(url):
    """Fetch a single image through the shared session and return its bytes."""
    resp = session.get(url=url, headers=headers)
    return resp.content
def save_pig(data):
    """Write one image's bytes to meizitu/<uuid>.jpg.

    The original drew a random number for the filename, which can collide
    across worker threads and silently overwrite images (its own comment
    flagged this); uuid4 hex names are collision-safe.
    """
    import uuid  # local import keeps this fix self-contained

    # Safe even if another thread (or the setup code) created the dir first.
    os.makedirs('meizitu', exist_ok=True)
    path = 'meizitu/' + uuid.uuid4().hex + '.jpg'
    with open(path, 'wb') as f:
        f.write(data)
print('多线程爬取开始')
t0 = time.time()

# Thread pool (multiprocessing.dummy) — suits this I/O-bound workload.
pool = Pool(10)
pool.map(get_pig_url, page_url_list)          # stage 1: gather image URLs from every listing page
data_list = pool.map(download, pig_url_list)  # stage 2: download the image bytes
pool.map(save_pig, data_list)                 # stage 3: write them to disk
pool.close()
pool.join()

elapsed = time.time() - t0
print('多线程爬取结束')
print('耗时:', elapsed)

# Report how many files ended up in the output directory.
print(len(os.listdir('./meizitu')))

# Roughly 384 images end up in ./meizitu when all pages download successfully.
