# 2019-11-23
"""Scrape a Pexels search-result page and download every image it lists.

Commercial stock-photo sites usually protect their content well and are hard
to crawl; https://www.pexels.com/ offers free images and is easy to crawl.
Searching for the keyword "man" and inspecting the page source (F12 in
Chrome) shows that each result is an ``<img>`` tag whose ``srcset`` attribute
holds the image links, for example::

    <img srcset="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500 1x,
                 https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=2&w=500 2x"
         class="photo-item__img" alt="Man Smiling Behind Wall"
         data-image-width="3476" data-image-height="5214"
         data-big-src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&h=750&w=1260"
         data-large-src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&h=650&w=940"
         data-tiny-src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
         src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" >

Notes on Python file I/O: ``open(path, mode, encoding)``; use mode ``'w'``
for text and ``'wb'`` for binary data such as images.
"""
import html
import os
import re  # Python regular-expression library
import time

import requests

# Search-result page for the keyword "man".
SEARCH_URL = 'https://www.pexels.com/search/man/'

# Single output directory for both the saved page and the images.  The
# directory is created on demand so a fresh checkout does not fail.
OUTPUT_DIR = './pexels'

# Upper bound for every HTTP request so a stalled connection cannot hang
# the script forever.
REQUEST_TIMEOUT_SECONDS = 30

# Pause between two image downloads (0.1 s) to go easy on the server.
DOWNLOAD_DELAY_SECONDS = 0.1

# The parentheses capture the srcset value: "." matches any character,
# "*" repeats it, and "?" makes the match non-greedy so it stops at the
# first closing double quote.
SRCSET_PATTERN = re.compile(r'<img srcset="(.*?)".*?>')


def _first_srcset_url(srcset):
    """Return the first candidate URL of a ``srcset`` value, or ``None``.

    A srcset lists several candidates ("URL descriptor, URL descriptor, ...");
    only one of them is a usable download URL, so the whole attribute value
    must not be passed to ``requests.get``.  Character references such as
    ``&amp;`` are decoded because the value comes from raw HTML.
    """
    tokens = html.unescape(srcset).split()
    if not tokens:
        return None
    # A candidate without a descriptor is followed directly by the comma.
    url = tokens[0].rstrip(',')
    return url or None


def _extract_image_urls(page_html):
    """Return one download URL per ``<img srcset=...>`` tag in *page_html*."""
    candidates = map(_first_srcset_url, SRCSET_PATTERN.findall(page_html))
    return [url for url in candidates if url is not None]


def _save_search_page(page_html):
    """Write the fetched search page to ``man.html`` inside OUTPUT_DIR."""
    page_path = os.path.join(OUTPUT_DIR, 'man.html')
    with open(page_path, mode='w', encoding='utf-8') as fp:
        fp.write(page_html)
    # The saved HTML file contains the URLs of many images.
    print('网页保存成功!')


def _download_images(img_urls):
    """Download every URL in *img_urls* to ``<position>.jpg`` in OUTPUT_DIR.

    A failed download is reported and skipped so that one bad link does not
    abort the remaining downloads.
    """
    for number, img_url in enumerate(img_urls, start=1):
        try:
            response = requests.get(img_url, timeout=REQUEST_TIMEOUT_SECONDS)
            # Do not store an HTTP error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(str(number) + '号图片下载失败: ' + str(exc))
        else:
            image_path = os.path.join(OUTPUT_DIR, str(number) + '.jpg')
            with open(image_path, 'wb') as fp:
                fp.write(response.content)
            print(str(number) + '号图片下载成功!')
        time.sleep(DOWNLOAD_DELAY_SECONDS)


def main():
    """Fetch the search page, save it, and download all images it lists."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    response = requests.get(url=SEARCH_URL, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    page_html = response.text
    _save_search_page(page_html)

    # A list of str elements, one image URL per matched <img> tag.
    img_urls = _extract_image_urls(page_html)
    print(img_urls)
    _download_images(img_urls)


if __name__ == '__main__':
    main()