While learning and practicing web crawling I wrote a little program, just for fun, that scrapes girl pictures.
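The script below leans on BeautifulSoup's find() / find_all(); the key point is what each call returns, which the module docstring in the code also summarizes. As a quick sanity check, here is a minimal sketch (the HTML fragment is made up for illustration, not taken from the target site):

from bs4 import BeautifulSoup

demo = '<div class="main-image"><a href="/1"><img src="a.jpg"></a><a href="/2"></a></div>'
soup = BeautifulSoup(demo, 'html.parser')

print(type(soup))                # <class 'bs4.BeautifulSoup'>
print(type(soup.find('a')))      # <class 'bs4.element.Tag'>, supports ['href'] and further find()
print(type(soup.find_all('a')))  # <class 'bs4.element.ResultSet'>, a list-like object; index it first
print(soup.find('a')['href'])    # /1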
from bs4 import BeautifulSoup
import requests
import os
from threading import Thread

'''
The soup.find(name, attrs, recursive, string, **kwargs) function:
    name:  which tag to look for
    attrs: a specific attribute of that tag, e.g. class
Note:
    BeautifulSoup() returns <class 'bs4.BeautifulSoup'>
    find() returns <class 'bs4.element.Tag'>
    find_all() returns <class 'bs4.element.ResultSet'>
    a <class 'bs4.element.ResultSet'> cannot be searched again with find/find_all
'''

def first_page(url):
    '''
    Follow a gallery link from the front page, e.g.
    http://www.mzitu.com/155036
    http://www.mzitu.com/155036/2
    find the maximum page number and build the URL of every page
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1',
    }
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    # hrefs of the <a> tags inside the main-image block (collected but not used below)
    girl_url_list = [a['href'] for a in soup.find('div', class_='main-image').find_all('a')]
    # maximum page number of the gallery
    pic_max = soup.find_all('span')[10].text
    # gallery title, a plain string thanks to .text
    title = soup.find('h2', class_='main-title').text
    pic_urls = []
    for i in range(1, int(pic_max) + 1):
        pic_url = url + '/' + str(i)
        pic_urls.append(pic_url)
    return pic_urls, title

def get_link(url):
    '''
    From a page URL such as http://www.mzitu.com/155036/2,
    extract the actual image URL, e.g. http://i.meizitu.net/2018/10/18b01.jpg
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1',
    }
    link_dict = {}
    res = first_page(url)
    print(res)
    for pic_url in res[0]:
        html = requests.get(pic_url, headers=headers)
        mess = BeautifulSoup(html.text, 'html.parser')
        link = mess.find('img', alt=res[-1])['src']  # ['src'] yields the image URL as a plain string
        pic_name = link.split('/')[-1]
        link_dict[link] = pic_name
    return link_dict

def download(url):
    '''
    Download every image found by get_link()
    '''
    link_dict = get_link(url)
    for link in link_dict:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Referer': link  # the site blocks hotlinking, so tell the server which page the request came from
        }
        html = requests.get(link, headers=headers)
        os.chdir('C:/Users/asus/Desktop/code/9.爬虫/简单web爬虫/picture')  # folder to save the images in
        with open(link_dict[link], 'wb') as fp:
            fp.write(html.content)

if __name__ == '__main__':
    # The gallery index pages aren't crawled yet, so these links are added by hand for now, which is a bit crude...
    urls = ['http://www.mzitu.com/47580', 'http://www.mzitu.com/108003', 'http://www.mzitu.com/48342']
    t_list = []
    for url in urls:
        t = Thread(target=download, args=(url,))  # one crawler thread per gallery
        t_list.append(t)
    for t in t_list:
        t.start()
    for t in t_list:
        t.join()
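The only non-obvious part of download() is the hotlink protection: the image server rejects requests that lack a Referer header. A minimal standalone sketch of just that step (the image URL is the example format mentioned in get_link(), and the picture folder name is an assumption; os.makedirs is used here instead of the hard-coded os.chdir):

import os
import requests

img_url = 'http://i.meizitu.net/2018/10/18b01.jpg'  # example image URL format from get_link()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Referer': img_url,  # pass the hotlink check by telling the server where the request came from
}
save_dir = 'picture'                  # assumed output folder, created if missing
os.makedirs(save_dir, exist_ok=True)
resp = requests.get(img_url, headers=headers, timeout=10)
with open(os.path.join(save_dir, img_url.split('/')[-1]), 'wb') as fp:
    fp.write(resp.content)            # images are binary data, so write in 'wb' mode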