1.拿到索引页的链接
# Step 1: fetch each index page of the listing.
import requests  # pip3 install requests -- a higher-level HTTP client than urllib
import re

# URL pattern observed on the site:
#   http://www.xiaohuar.com/list-3-0.html  -> page 1
#   http://www.xiaohuar.com/list-3-1.html  -> page 2
#   http://www.xiaohuar.com/list-3-2.html  -> page 3
#   http://www.xiaohuar.com/list-3-3.html  -> page 4
#   http://www.xiaohuar.com/list-3-4.html  -> page 5


# 1. Send the request and fetch the index page.
def get_index_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 response.

    NOTE(review): no User-Agent header is sent; this site tolerates that,
    but most sites will not.
    """
    response = requests.get(url)  # send the request, get a response object
    if response.status_code == 200:
        # Bug fix: the original printed the HTML and left the return
        # commented out, so main() printed "None" for every page.
        return response.text
    return None


# 2. Parse the index page.
def parse_index(index_page):
    """Extract the wanted links from *index_page* (pattern still to be written)."""
    # Placeholder pattern -- the real rule that captures the detail-page
    # hrefs is filled in at the next step of the tutorial.
    return re.findall("", index_page, re.S)


# 3. Detail-page scraping -- stubs to be implemented in later steps.
def parse_detail_page():
    pass


def parse_detail():
    pass


def get_movie():
    pass


def main():
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'  # URL template for the listing
    for i in range(5):
        url = base_url.format(page_num=i)
        print(get_index_page(url))  # show the fetched HTML (None on failure)


if __name__ == '__main__':
    main()
优化后
2.循环拿到详情页的链接
# Step 2: loop over the index pages and yield every detail-page link.
import requests  # pip3 install requests -- a higher-level HTTP client than urllib
import re

# URL pattern observed on the site:
#   http://www.xiaohuar.com/list-3-0.html  -> page 1
#   http://www.xiaohuar.com/list-3-1.html  -> page 2
#   http://www.xiaohuar.com/list-3-2.html  -> page 3
#   http://www.xiaohuar.com/list-3-3.html  -> page 4
#   http://www.xiaohuar.com/list-3-4.html  -> page 5


# 1. Send the request and fetch the index page.
def get_index_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 response.

    NOTE(review): no User-Agent header is sent; this site tolerates that,
    but most sites will not.
    """
    response = requests.get(url)  # send the request, get a response object
    if response.status_code == 200:
        return response.text
    return None


# 2. Parse the index page.
def parse_index(index_page):
    """Yield absolute detail-page URLs found in *index_page*.

    The anchors are located by the class="items" marker; re.S lets ``.``
    match newlines so the pattern spans lines.  Relative hrefs are
    prefixed with the site root.
    """
    # Bug fix: get_index_page returns None on a non-200 response; the
    # original passed that straight into re.findall and raised TypeError.
    if not index_page:
        return
    urls = re.findall('class="items".*?href="(.*?)"', index_page, re.S)
    print(urls)  # show the extracted hrefs
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com' + url
        yield url


# 3. Detail-page scraping -- stubs to be implemented in later steps.
def parse_detail_page():
    pass


def parse_detail():
    pass


def get_movie():
    pass


def main():
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'  # URL template for the listing
    for i in range(5):
        url = base_url.format(page_num=i)
        index_page = get_index_page(url)       # fetch the listing HTML
        detail_urls = parse_index(index_page)  # generator of detail-page links
        for detail_url in detail_urls:         # walk the detail-page links
            print(detail_url)


if __name__ == '__main__':
    main()
优化后
3.向详情页发请求
# Step 3: follow each detail page, find the mp4 URL, and download it.
import requests  # pip3 install requests -- a higher-level HTTP client than urllib
import re
import hashlib
import time
import os

# URL pattern observed on the site:
#   http://www.xiaohuar.com/list-3-0.html  -> page 1
#   http://www.xiaohuar.com/list-3-1.html  -> page 2
#   http://www.xiaohuar.com/list-3-2.html  -> page 3
#   http://www.xiaohuar.com/list-3-3.html  -> page 4
#   http://www.xiaohuar.com/list-3-4.html  -> page 5

# Directory where downloaded videos are stored.
# Bug fix: the original raw string had lost its backslashes
# (r'C:UserszbkDesktopmp5') and named a non-existent "mp5" folder.
movie_path = r'C:\Users\zbk\Desktop\mp4'


# 1. Send a request and fetch any page (index or detail).
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 response.

    NOTE(review): no User-Agent header is sent; this site tolerates that,
    but most sites will not.
    """
    response = requests.get(url)  # send the request, get a response object
    if response.status_code == 200:
        return response.text
    return None


# 2. Parse the index page.
def parse_index(index_page):
    """Yield absolute detail-page URLs found in *index_page*.

    Anchors are located by the class="items" marker; relative hrefs are
    prefixed with the site root.
    """
    if not index_page:  # guard: get_page returns None on failure
        return
    urls = re.findall('class="items".*?href="(.*?)"', index_page, re.S)
    print(urls)  # show the extracted hrefs
    for url in urls:  # inspect each candidate and keep what we want
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com' + url
        yield url


# 3. Parse a detail page for the embedded video URL.
def parse_detail(detail_page):
    """Yield the mp4 URL embedded in *detail_page* (id="media"), if any."""
    if not detail_page:  # guard: get_page returns None on failure
        return
    matches = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    if matches:
        movie_url = matches[0]
        if movie_url.endswith('mp4'):
            yield movie_url


# 4. Download one video.
def get_movie(url):
    """Download *url* into movie_path under a unique md5-derived filename."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Hash timestamp + URL so concurrent/repeat downloads of the
            # same URL still get distinct filenames.
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(url.encode('utf-8'))
            # Bug fix: use os.path.join instead of '%s\%s.mp4', which
            # depended on '\%' happening not to be an escape sequence.
            filepath = os.path.join(movie_path, '%s.mp4' % m.hexdigest())
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print('%s 下载成功' % url)
    except Exception as e:
        # Best-effort download, but report the failure instead of the
        # original silent `pass`, which hid every error.
        print('%s 下载失败: %s' % (url, e))


def main():
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        url = base_url.format(page_num=i)
        index_page = get_page(url)
        detail_urls = parse_index(index_page)
        for detail_url in detail_urls:
            detail_page = get_page(detail_url)
            movie_urls = parse_detail(detail_page)
            for movie_url in movie_urls:
                get_movie(movie_url)


if __name__ == '__main__':
    main()
4.用多线程并发
# Step 4: same spider, but fan the work out over a thread pool.
import requests  # pip3 install requests
import re
import hashlib
import time
import os
from concurrent.futures import ThreadPoolExecutor

# Shared pool: fetches, parses and downloads are all submitted here.
pool = ThreadPoolExecutor(50)

# Directory where downloaded videos are stored.
# Bug fix: the original raw string had lost its backslashes
# (r'C:UserszbkDesktopmp4').
movie_path = r'C:\Users\zbk\Desktop\mp4'


def get_page(url):
    """Fetch *url* and return its HTML text, or None on any failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except Exception:
        pass  # best effort: a failed fetch simply yields None
    return None


def parse_index(index_page):
    """Callback for index-page futures: submit every detail link for fetching.

    *index_page* is a Future; .result() is the HTML (or None on failure).
    """
    index_page = index_page.result()  # unwrap the future (callback style)
    if not index_page:  # guard: fetch failed, nothing to parse
        return
    urls = re.findall('class="items".*?href="(.*?)"', index_page, re.S)
    for detail_url in urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        # Async submit: pool.submit(fn, arg); parse_detail runs when done.
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)


def parse_detail(detail_page):
    """Callback for detail-page futures: submit the mp4 URL for download."""
    detail_page = detail_page.result()  # unwrap the future (callback style)
    if not detail_page:  # guard: fetch failed, nothing to parse
        return
    matches = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    if matches:
        movie_url = matches[0]
        if movie_url.endswith('mp4'):
            # Hand the video URL to the download worker.
            pool.submit(get_movie, movie_url)


def get_movie(url):
    """Download *url* into movie_path under a unique md5-derived filename."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Hash timestamp + URL so concurrent downloads get distinct names.
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(url.encode('utf-8'))
            # Bug fix: os.path.join instead of '%s\%s.mp4', which relied
            # on '\%' happening not to be an escape sequence.
            filepath = os.path.join(movie_path, '%s.mp4' % m.hexdigest())
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print('%s 下载成功' % url)
    except Exception:
        pass


def main():
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        url = base_url.format(page_num=i)
        # Async submit; add_done_callback chains the parsing step.
        pool.submit(get_page, url).add_done_callback(parse_index)


if __name__ == '__main__':
    # Bug fix: the original ended with `main(` -- a SyntaxError that made
    # the whole script unrunnable.
    main()