Python版本 3.6
简单写了一个爬虫,在编写的过程中熟悉 Python 语法;不得不说,Python 用起来真 666。
代码的功能是:访问网站首页,将所有 a 标签的值作为文件夹名,并将当前网页的所有图片下载到对应的文件夹中。其实还有很多需要修改和完善的地方,比如异常处理、多线程、递归等,以后有机会再说吧。欢迎拍砖。
# -*- coding: utf-8 -*-
"""Small recursive image crawler.

``CallView`` fetches one page, saves every ``<img>`` it references into a
directory and hands the page's ``<a>`` links to ``ForRequest`` on a worker
thread.  ``ForRequest`` gives each link its own sub-directory (named after the
link text) and calls ``CallView`` on it, until the depth limit is reached.
"""
import os
import re
import threading
import time
from http.client import HTTPException
from urllib import request
from urllib.parse import urldefrag, urljoin, urlsplit

from bs4 import BeautifulSoup


# Depth counter value handed to the start page.
exe_Count = 1
# Every link URL queued so far; shared by all pages so no link is crawled twice.
aList = []

# ForRequest stops following links once the depth counter reaches this value
# (the original code compared against the literal 2).
MAX_DEPTH = 2
# Number of leading characters of the link text used as the directory name.
_DIR_NAME_LENGTH = 11
# Characters that cannot appear in a file or directory name on Windows/POSIX.
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
# CallView runs on several threads; the check-then-append on aList must be atomic.
_aList_lock = threading.Lock()
# Errors a single download can reasonably hit: network/file-system failures
# (URLError, HTTPError and socket timeouts are all OSError), malformed URLs
# (ValueError) and broken HTTP responses.
_DOWNLOAD_ERRORS = (OSError, ValueError, HTTPException)
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2716.5 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}


def _safe_name(text):
    """Return *text* with characters that are illegal in file names replaced by '_'."""
    return _INVALID_NAME_CHARS.sub("_", text)


def _collect_new_links(soup, base_url):
    """Return ``[directory_name, absolute_url]`` pairs for links not queued before.

    Anchors without plain text or without an ``href`` are skipped, as are
    non-HTTP(S) targets.  Each accepted URL is recorded in the global ``aList``.
    """
    links = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if a.string is None or not href:
            continue
        # Resolve relative links and drop "#fragment" so the same page is not
        # queued once per in-page anchor.
        target = urldefrag(urljoin(base_url, href.strip())).url
        if urlsplit(target).scheme not in ("http", "https"):
            continue
        with _aList_lock:
            if target in aList:
                continue
            aList.append(target)
        links.append([_safe_name(a.string.strip()[:_DIR_NAME_LENGTH]), target])
    return links


def _download_images(soup, base_url, timeout, directoryPath):
    """Save every distinct ``<img>`` of the page into *directoryPath*.

    A failure on one image is reported and does not stop the remaining ones.
    """
    seen = set()
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        print(src)
        img_url = urljoin(base_url, src.strip())
        if img_url in seen:
            continue
        seen.add(img_url)
        parts = urlsplit(img_url)
        # Use only the URL path for the file name so a query string does not
        # end up in it.
        filename = _safe_name(os.path.basename(parts.path))
        if parts.scheme not in ("http", "https") or not filename:
            continue
        try:
            req = request.Request(img_url, headers=_HEADERS)
            with request.urlopen(req, timeout=timeout) as response:
                data = response.read()
            # Open the file only after the download succeeded so a failed
            # request does not leave an empty file behind.
            with open(os.path.join(directoryPath, filename), "wb") as out:
                out.write(data)
        except _DOWNLOAD_ERRORS as exc:
            print("访问图片或者写入本地Error: " + str(exc))


def CallView(url, timeout, directoryPath, exe_count):
    """Crawl one page: download its images and queue its links.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds passed to ``urlopen`` for every request.
        directoryPath: Directory the page's images are written to; created
            (including missing parents) when it does not exist.
        exe_count: Current depth counter, forwarded to ``ForRequest``.

    Errors are reported on stdout and never propagate: the crawl is
    best-effort and one broken page must not stop the others.
    """
    try:
        req = request.Request(url, headers=_HEADERS)
        with request.urlopen(req, timeout=timeout) as response:
            # After a redirect, relative links are relative to the final URL.
            base_url = response.geturl()
            soup = BeautifulSoup(response, "html.parser")

        links = _collect_new_links(soup, base_url)

        if not os.path.exists(directoryPath):
            os.makedirs(directoryPath, exist_ok=True)
            print("新目录:" + directoryPath)

        # Follow the links on a worker thread while this thread downloads
        # the images of the current page.
        worker = threading.Thread(
            target=ForRequest, args=(links, timeout, directoryPath, exe_count)
        )
        worker.start()

        _download_images(soup, base_url, timeout, directoryPath)
    except request.HTTPError as e:
        print(e.code)
    except Exception as exc:  # best-effort boundary: report and carry on
        print("CallView Error: " + repr(exc))


def ForRequest(listA, timeout, directoryPath, exe_count):
    """Crawl every link in *listA* into its own sub-directory.

    Args:
        listA: Sequence of ``[directory_name, url]`` pairs.
        timeout: Timeout in seconds forwarded to ``CallView``.
        directoryPath: Parent directory for the per-link sub-directories.
        exe_count: Current depth counter; nothing is crawled once it has
            reached ``MAX_DEPTH``.
    """
    print("当前已执行:" + str(exe_count) + " 次")
    # Depth limit reached: return so the worker thread ends cleanly instead of
    # dying with a traceback from a raised exception.
    if exe_count >= MAX_DEPTH:
        return
    next_count = exe_count + 1

    for info in listA:
        child_path = os.path.join(directoryPath, info[0])
        try:
            os.makedirs(child_path, exist_ok=True)
        except OSError as exc:
            # One unusable directory name must not abort the remaining links.
            print("CallView Error: " + repr(exc))
            continue
        CallView(info[1], timeout, child_path, next_count)


try:
    print("爬虫开始活动了")
    # urlopen's timeout is in seconds; the original 5000 would have waited
    # more than 80 minutes on a stalled connection.
    CallView("http://www.xxxxx.com", 5, "D:/PythonTest/Img/素材公社", exe_Count)
    print("爬虫正在偷偷活动,不要着急哦!")
except Exception:
    print("Error")