What was the original motivation for learning web scraping? I can't speak for anyone else, but mine was nothing more than the few simple lines of code below.
(Kept here for reference)
import requests
from pyquery import PyQuery as py
import time
import os

########## The domain keeps changing, so follow the redirect to get the latest one ##########
url = 'https://www.ccc560.com'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
print(url)
response = requests.get(url, headers=headers)
url = response.url

########## Fill in the board code ##########
url_initial = url + '/htm/piclist1/'  # Asia_photo
'''
To find out what each board contains, just try them:
girllist1~16
piclist1~4, 6~8
'''
# s = requests.session()
# proxies = {
#     "https": "http://118.254.147.216:3218"
# }
print(url_initial)
response = requests.get(url_initial, headers=headers)
doc = py(response.text)

########## The partial page URLs we need are wrapped in <li> tags ##########
########## Join them with the base URL to get the full addresses ##########
items = doc('li').items()
for item in items:
    time.sleep(1)
    str1 = item.find('a').attr('href')
    url_pic = url + str1
    print(url_pic)
    ########## Create a folder for each page URL ##########
    path = 'E:\\demo\\' + str1
    if not os.path.exists(path):
        os.makedirs(path)
    ########## The image addresses we need are in the src attribute of the <img> tags ##########
    response_pic = requests.get(url_pic, headers=headers)
    doc_pic = py(response_pic.text)
    items_pic = doc_pic('div.picContent img').items()
    i = 0
    for item in items_pic:
        pic = item.attr('src')
        print(pic)
        ########## Give each image a unique name ##########
        with open(path + '\\' + str(i) + '.jpg', 'wb') as f:
            cd = requests.get(pic).content
            f.write(cd)
        i = i + 1
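The most fragile parts of the script above are the hand-built Windows paths and the bare requests.get calls with no timeout or error handling. Purely as an illustration of one way to harden the download step (the helper name save_image, the 10-second timeout, and the retry count are my own choices, not part of the original script), it could look roughly like this:

# Hypothetical helper, not part of the original script: saves one image with
# a timeout and simple retries, building the path with pathlib so the
# backslash-escaping problem cannot come back.
from pathlib import Path
import requests

def save_image(pic_url, folder, index, headers, retries=2):
    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)     # create the folder if it is missing
    target = folder / (str(index) + '.jpg')
    for attempt in range(retries + 1):
        try:
            resp = requests.get(pic_url, headers=headers, timeout=10)
            resp.raise_for_status()               # fail loudly on 4xx/5xx responses
            target.write_bytes(resp.content)
            return True
        except requests.RequestException:
            if attempt == retries:                # give up after the last retry
                return False

Inside the inner loop, the with open(...) block would then shrink to a single call such as save_image(pic, path, i, headers), and a failed download would no longer crash the whole run.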