zoukankan      html  css  js  c++  java
  • python写的百度图片爬虫

    学了一下python正则表达式,写一个百度图片爬虫玩玩。

    当技术遇上心术不正的人,就成我这样的2B青年了。

    使用 Python 3.6 开发。程序已经打包好,下载地址: http://pan.baidu.com/s/1bpalugf 密码:kfk4

    #!/usr/bin/env python3
    from tkinter import *
    import re,os,requests,hashlib,threading
    from PIL import Image
    
    class Application(Frame):
        """Tkinter front end for a small Baidu image-search downloader.

        The window holds a keyword entry, an "image compression" checkbox and
        a download button.  Pressing the button downloads the images found for
        the keyword into ``<Desktop>/<keyword>``, removes byte-identical
        duplicates and, when the checkbox is ticked, halves every JPEG wider
        than ``MAX_WIDTH`` pixels.
        """

        # Parent folder of every per-keyword download folder.
        DESKTOP_DIR = os.path.join(os.path.expanduser('~'), 'Desktop')
        SEARCH_URL = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
        OBJ_URL_RE = re.compile(r'"objURL":"(.*?)",')
        # Characters that cannot appear in a Windows file or folder name.
        _UNSAFE_DIR_CHARS = re.compile(r'[\\/:*?"<>|]')

        THREAD_COUNT = 5        # number of parallel download workers
        PAGE_SIZE = 20          # spacing between the workers' starting offsets
        PAGES_PER_THREAD = 50   # result pages requested by each worker
        # Each worker advances by the combined span of all workers, so the
        # workers never request the same offset.
        PAGE_STRIDE = THREAD_COUNT * PAGE_SIZE
        TIMEOUT = 60            # seconds, applied to every HTTP request
        MAX_WIDTH = 500         # JPEGs wider than this are shrunk by suoxiao()

        def __init__(self, master=None):
            """Create the frame inside *master* and build its widgets."""
            Frame.__init__(self, master)
            # Kept for backward compatibility; no method of this class reads it.
            self.school = threading.local()
            self.pack()
            self.createWidgets()

        def createWidgets(self):
            """Build the keyword entry, compression checkbox and download button."""
            self.nameLabel = Label(self, text='请输入关键词:')
            self.nameLabel.grid(row=0, sticky=W)

            self.nameInput = Entry(self)
            self.nameInput.grid(row=0, column=1)

            # 1 when the checkbox is ticked, 0 otherwise.
            self.picys = IntVar()
            self.Checkbutton = Checkbutton(self, text='图片压缩', variable=self.picys)
            self.Checkbutton.grid(row=1, column=0, columnspan=2, sticky=W)

            self.alertButton = Button(self, text='下载', command=self.gorun)
            self.alertButton.grid(row=1, column=1, sticky=E)

        def cddir(self):
            """Create the download folder for the current keyword and enter it.

            Returns:
                The absolute path of the folder, which is also made the
                process's current working directory.

            Raises:
                ValueError: If the keyword yields no usable folder name
                    (empty, only whitespace, or only dots).
            """
            keyword = self.nameInput.get().strip()
            # Replace characters that are illegal in folder names, then drop
            # trailing dots/spaces so that '.' and '..' can never be selected.
            folder = self._UNSAFE_DIR_CHARS.sub('_', keyword).rstrip('. ')
            if not folder:
                raise ValueError('search keyword is empty')
            target = os.path.join(self.DESKTOP_DIR, folder)
            os.makedirs(target, exist_ok=True)
            os.chdir(target)
            return target

        def gorun(self):
            """Button callback: download, de-duplicate and optionally compress.

            The call blocks until every worker thread has finished, so the
            window does not respond while a download is running.
            """
            word = self.nameInput.get().strip()
            try:
                save_dir = self.cddir()
            except ValueError:
                # Nothing sensible to search for or to name the folder after.
                return

            workers = []
            for index in range(self.THREAD_COUNT):
                worker = threading.Thread(
                    target=self.xiazai,
                    args=(index * self.PAGE_SIZE, word, save_dir),
                )
                worker.start()
                workers.append(worker)

            # Wait for all workers: post-processing must not run while files
            # are still being written.
            for worker in workers:
                worker.join()

            self.delfile(save_dir)
            if self.picys.get() == 1:
                self.suoxiao(save_dir)

        def xiazai(self, page, word, save_dir=None):
            """Download the images listed on one worker's share of result pages.

            Args:
                page: ``pn`` offset of the first result page to request.
                word: Search keyword sent to the image search.
                save_dir: Folder the images are written to.  Defaults to the
                    current working directory.
            """
            if save_dir is None:
                save_dir = os.getcwd()
            offset = page
            for _ in range(self.PAGES_PER_THREAD):
                params = {'word': word, 'pn': offset}
                offset += self.PAGE_STRIDE
                try:
                    html = requests.get(
                        self.SEARCH_URL, params=params, timeout=self.TIMEOUT
                    ).text
                except requests.exceptions.RequestException:
                    # A failed result page must not end the whole worker.
                    continue
                for image_url in self.OBJ_URL_RE.findall(html):
                    self._save_image(image_url, save_dir)

        def _save_image(self, image_url, save_dir):
            """Fetch *image_url* into *save_dir*; any failure skips the image."""
            name = os.path.basename(image_url)
            if ('?' in name) or ('&' in name) or ('.' not in name):
                # No usable file name in the URL.  Derive one from the URL
                # itself so that parallel workers cannot pick the same name.
                name = hashlib.md5(image_url.encode('utf-8')).hexdigest() + '.jpg'
            target = os.path.join(save_dir, name)
            if os.path.exists(target):
                return  # already downloaded; checked before any network use

            try:
                response = requests.get(image_url, timeout=self.TIMEOUT)
                try:
                    # Do not store an HTTP error page as if it were an image.
                    response.raise_for_status()
                    data = response.content
                finally:
                    response.close()
            except requests.exceptions.RequestException:
                return

            try:
                with open(target, 'wb') as out:
                    out.write(data)
            except OSError:
                # E.g. the URL's file name is not valid on this file system.
                return

        def suoxiao(self, directory=None):
            """Halve the dimensions of every JPEG wider than ``MAX_WIDTH``.

            Args:
                directory: Folder to process, including its sub-folders.
                    Defaults to the folder selected by ``cddir()``.
            """
            if directory is None:
                directory = self.cddir()
            for root, _dirs, files in os.walk(directory):
                for name in files:
                    if not name.lower().endswith(('.jpg', '.jpeg')):
                        continue
                    path = os.path.join(root, name)
                    try:
                        with Image.open(path) as im:
                            w, h = im.size
                            if w > self.MAX_WIDTH:
                                im.thumbnail((w // 2, h // 2))
                                # JPEG cannot store alpha or palette modes;
                                # convert so the save does not fail on them.
                                if im.mode in ('L', 'RGB', 'CMYK'):
                                    shrunk = im
                                else:
                                    shrunk = im.convert('RGB')
                                shrunk.save(path, 'jpeg')
                    except OSError:
                        print('跳过此文件')

        def md5sum(self, filename):
            """Return the hexadecimal MD5 digest of the file *filename*."""
            digest = hashlib.md5()
            with open(filename, 'rb') as fh:
                # Read in chunks so large files are never held in memory whole.
                for chunk in iter(lambda: fh.read(8192), b''):
                    digest.update(chunk)
            return digest.hexdigest()

        def delfile(self, directory=None):
            """Delete files whose content duplicates an earlier file.

            Files are visited top-down in sorted name order; the first file
            with a given MD5 digest is kept and later ones are removed.

            Args:
                directory: Folder to process, including its sub-folders.
                    Defaults to the folder selected by ``cddir()``.
            """
            if directory is None:
                directory = self.cddir()
            seen = set()
            for root, dirs, files in os.walk(directory):
                dirs.sort()  # fix the traversal order of sub-folders
                for name in sorted(files):
                    path = os.path.join(root, name)
                    digest = self.md5sum(path)
                    if digest in seen:
                        os.remove(path)
                    else:
                        seen.add(digest)
    
    
    
    # Script entry: create the application frame (it builds its own widgets),
    # set the title of the window that hosts it, then hand control to the
    # Tk event loop.
    app = Application()
    app.master.title('图片下载器')
    app.mainloop()
    

      

    无耻的求一下赞助

  • 相关阅读:
    归并排序
    [转]html5 Canvas画图教程(1)—画图的基本常识
    [转]浏览器工作原理
    [转]Web开发者和设计师必须要知道的 iOS 8 十个变化
    mobile web开发(1) html页面头部基本设置
    差点难产的HTML5
    感觉离开了好久
    Java----区别
    MySQL---sql语句优化
    MySQL---数据库优化
  • 原文地址:https://www.cnblogs.com/guoyabin/p/6873435.html
Copyright © 2011-2022 走看看