The approach for crawling the site
Step 1: Start by analyzing the site's URL pattern. Paging through the galleries produces links such as http://www.mmjpg.com/mm/1570 and http://www.mmjpg.com/mm/1569; only the trailing number changes.
Step 2: Inside a gallery, page through 1, 2, 3, ... and use the browser's Inspect tool (view source) to find where the page numbers live.
Step 3: Inspect the image itself to get its URL, then download it and write it to a file.
That is the whole idea. A quick sketch of the URL structure follows, and then the code works through each step in detail.
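As a quick illustration of the URL pattern described above (a standalone sketch based only on the example links shown; the real script builds these same strings in main()):

gallery_index = 1570                                        # e.g. the newest gallery
gallery_url = 'http://www.mmjpg.com/mm/%s' % gallery_index  # gallery front page
page_url = '%s/%s' % (gallery_url, 3)                       # page 3 of that gallery
print(gallery_url)   # http://www.mmjpg.com/mm/1570
print(page_url)      # http://www.mmjpg.com/mm/1570/3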
Step 1:
First write a main function. It builds the gallery URL from the index at the end (e.g. 1570), gets the number of pages (1, 2, ...), and hands each page off to the image-downloading code. Everything is split into small functions.
def main(index):
    # 1. Build the gallery URL
    main_url = 'http://www.mmjpg.com/mm/%s' % index
    # 2. Get the number of pages via a get_page() function
    page = get_page(main_url)
    # Create the working directory if it does not exist yet, then switch into it
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)
    # 3. Walk every page and fetch the image on it
    for i in range(1, int(page) + 1):
        url = '%s/%s' % (main_url, i)
        try:
            get_img(url)   # fetch the image URL on this page and save it
        except Exception as e:
            raise e
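The snippet above relies on a few module-level names (the imports, headers and path) that are defined in the complete code at the end; they are repeated here so this step can be read on its own:

import os

import requests
from lxml import etree

headers = {
    'Referer': 'http://www.mmjpg.com/mm/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}
# directory the images are saved into, under the current working directory
path = os.path.join(os.getcwd(), 'img')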
Step 2:
Grab the gallery index, again wrapped in a function.
# 1. Inspect the home page to find the HTML that carries the latest index (1570):
'''
<li><a href="http://www.mmjpg.com/mm/1570" target="_blank">
we want to slice the number 1570 out of that href
'''
# Turn http://www.mmjpg.com/mm/1570 into the full range of indexes: 1, 2, ..., 1570
def get_index():
    # 2. Fetch the home page HTML
    r = requests.get('http://www.mmjpg.com', headers=headers)
    r.encoding = 'utf-8'
    html = r.text
    # 3. etree.HTML() builds an XPath-queryable tree and auto-repairs the HTML text;
    #    etree.tostring() would dump the repaired document as bytes
    selector = etree.HTML(html)
    # 4. Take the first <a> under a <li>
    content = selector.xpath('//li/a')[0]
    # 5. Take its href, i.e. http://www.mmjpg.com/mm/1570
    num = content.xpath('@href')[0]
    # 6. Split on '/', giving ['http:', '', 'www.mmjpg.com', 'mm', '1570']
    num = num.split('/')
    # 7. Take the last piece and cast it to int
    num = int(num[-1])
    # return every gallery index from 1 up to the newest one (1570)
    return range(1, num + 1)
    # for each in range(1, num + 1):
    #     print(each)
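To make the slicing in steps 6 and 7 concrete, this is what it does to the example href (a standalone check, not part of the crawler):

href = 'http://www.mmjpg.com/mm/1570'
parts = href.split('/')    # ['http:', '', 'www.mmjpg.com', 'mm', '1570']
print(int(parts[-1]))      # 1570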
Step 3:
Get the page numbers 1, 2, 3, ...
# 1. Right-click the pagination bar and Inspect; the markup looks like this:
'''
<div class="page" id="page">
<em class="ch preno">没有了</em>
<em>1</em>
<a href="/mm/1570/2">2</a>
<a href="/mm/1570/3">3</a>
<a href="/mm/1570/4">4</a>
<a href="/mm/1570/5">5</a>
<a href="/mm/1570/6">6</a>
<i> </i>
<a href="/mm/1570/50">50</a>
<em class="ch all" id="opic" onclick="openall(1);">全部图片</em>
<a href="/mm/1570/2" class="ch next">下一张</a></div>
'''
def get_page(url):
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    html = r.text
    selector = etree.HTML(html)
    # 2. Collect the text of every <a> in the pagination div (2, 3, ..., 50, 下一张);
    #    the last <a> is the "next" link, so [-2] is the total page count (50)
    page = selector.xpath('//div[@id="page"]/a/text()')[-2]
    return page
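To see why the [-2] index works, the same XPath can be run against the markup shown above instead of a live page (a standalone sketch; only the <a> elements matter here):

from lxml import etree

snippet = '''
<div class="page" id="page">
<a href="/mm/1570/2">2</a><a href="/mm/1570/3">3</a><a href="/mm/1570/4">4</a>
<a href="/mm/1570/5">5</a><a href="/mm/1570/6">6</a><a href="/mm/1570/50">50</a>
<a href="/mm/1570/2" class="ch next">下一张</a></div>
'''
texts = etree.HTML(snippet).xpath('//div[@id="page"]/a/text()')
print(texts)       # ['2', '3', '4', '5', '6', '50', '下一张']
print(texts[-2])   # '50' -> the total number of pages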
Step 4:
Get the image URL.
# 1. Right-click the image and Inspect; the markup looks like this:
'''
<div class="content" id="content">
<a href="http://www.mmjpg.com/mm/1570/2">
<img src="http://fm.shiyunjj.com/2018/1570/1i28.jpg" data-img="http://fm.shiyunjj.com/2018/1570/1i28.jpg" alt="萌味十足的小尤奈雪白胴体相当性感"></a>
</div>
'''
# Fetch a page and pull the image URL out of it
def get_img(url):
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    html = r.text
    selector = etree.HTML(html)
    try:
        # 2. Take the <img> inside the content div
        content = selector.xpath('//div[@id="content"]/a/img')[0]
        # 3. The image URL is its src attribute
        img_url = content.xpath('@src')[0]
        # 4. The alt text doubles as the file name
        title = content.xpath('@alt')[0]
        # 5. Save the title and the image it points to
        save_img(title, img_url)
    except Exception as e:
        print('Error!')
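Step 5:
Save the image. get_img() above hands the title and URL to save_img(), which skips files that already exist and otherwise writes the response bytes to disk. This is the same helper that appears in the complete code below:

def save_img(name, url):
    name = name + '.jpg'
    # skip images that have already been downloaded
    if name in os.listdir(path):
        print('duplicate file')
    else:
        r = requests.get(url, headers=headers)
        with open(name, 'wb') as f:
            f.write(r.content)
        print(name)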
Step 6:
Write the __name__ entry point.
if __name__ == '__main__':
    indexs = get_index()
    # get_index() returns 1, 2, ..., up to the newest index (e.g. 1570); reversed()
    # walks that range backwards, so the crawl starts at the newest gallery and works down to 1
    for index in reversed(indexs):
        main(index)
The complete code:
import os

import requests
from lxml import etree

headers = {
    'Referer': 'http://www.mmjpg.com/mm/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}
# directory the images are saved into, under the current working directory
path = os.path.join(os.getcwd(), 'img')


def get_index():
    # grab the newest gallery index from the home page and return 1..newest
    r = requests.get('http://www.mmjpg.com', headers=headers)
    r.encoding = 'utf-8'
    html = r.text
    selector = etree.HTML(html)
    content = selector.xpath('//li/a')[0]
    num = content.xpath('@href')[0]
    num = num.split('/')
    num = int(num[-1])
    return range(1, num + 1)


def get_page(url):
    # read the total page count from the pagination bar
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    html = r.text
    selector = etree.HTML(html)
    page = selector.xpath('//div[@id="page"]/a/text()')[-2]
    return page


def get_img(url):
    # pull the image URL and title out of a single page and save it
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    html = r.text
    selector = etree.HTML(html)
    try:
        content = selector.xpath('//div[@id="content"]/a/img')[0]
        img_url = content.xpath('@src')[0]
        title = content.xpath('@alt')[0]
        save_img(title, img_url)
    except Exception as e:
        print('Error!')


def save_img(name, url):
    name = name + '.jpg'
    if name in os.listdir(path):
        print('duplicate file')   # already downloaded, skip it
    else:
        r = requests.get(url, headers=headers)
        with open(name, 'wb') as f:
            f.write(r.content)
        print(name)


def main(index):
    main_url = 'http://www.mmjpg.com/mm/%s' % index
    page = get_page(main_url)
    # create the working directory if needed, then switch into it
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)
    for i in range(1, int(page) + 1):
        url = '%s/%s' % (main_url, i)
        try:
            get_img(url)
        except Exception as e:
            raise e


if __name__ == '__main__':
    indexs = get_index()
    # reversed() walks the range backwards: newest gallery first
    for index in reversed(indexs):
        main(index)
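For a quick test you may not want to crawl every gallery; one option (a hypothetical tweak, not part of the original script) is to take only the few newest indexes before calling main():

if __name__ == '__main__':
    indexs = get_index()
    # only the three newest galleries, e.g. 1570, 1569, 1568
    for index in list(reversed(indexs))[:3]:
        main(index)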