百度图片爬虫-python版

zoukankan html css js c++ java

百度图片爬虫-python版

1 #coding:utf-8
  2
  3 """
  4
  5 Created on 2015-9-17
  6
  7
  8
  9 @author: huangxie
10
11 """
12
13 import time,math,os,re,urllib,urllib2,cookielib
14
15 from bs4 import BeautifulSoup
16
17 import time
18
19 import re
20
21 import uuid
22
23 import json
24
25 from threading import Thread
26
27 from Queue import Queue
28
29 import MySQLdb as mdb
30
31 import sys
32
33 import threading
34
35 import utils
36
37 import imitate_browser
38
39 from MySQLdb.constants.REFRESH import STATUS
40
# Python 2 only: reload(sys) re-exposes sys.setdefaultencoding so that implicit
# str <-> unicode conversions use UTF-8 instead of ASCII.  The rest of the
# script mixes byte strings and unicode (e.g. str.format with unicode
# keywords), so the hack is kept.  `reload` is not a builtin on Python 3,
# hence the version guard.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# MySQL connection settings used by BaiduImage.__init__.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Proxy mapping handed to urllib2.ProxyHandler; get_pic() rebinds this global
# when it rotates to another proxy.
proxy = {u'http': u'222.39.64.13:8118'}

# Baidu image-search JSON endpoint: word = keyword, pn = result offset,
# rn = results per page.
TOP_URL = "http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"

# Baidu web-search page, used to discover related keywords.
KEYWORD_URL = "https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

# Request headers sent with every urllib2 request.
# An earlier, fuller header set (Accept, Accept-Charset, Accept-Encoding: gzip,
# Connection: close, Referer) was disabled.  Original author's note: if
# downloads are still refused, try setting Referer to the target site's host.
i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
80
81
82
def GetDateString():
    """Return today's local date as ``YYYY-M-D``.

    Month and day are NOT zero-padded (e.g. ``2015-9-17``); the string is used
    as a folder name, so the format is kept exactly as before.
    """
    today = time.localtime()
    return "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
90
91
92
class BaiduImage(threading.Thread):
    """Multi-threaded Baidu image-search crawler that stores results in MySQL.

    Worker threads (see start_work) loop over get_pic() and prepare_request().
    prepare_request() pulls a pending row from the ``info`` table and queues it;
    get_pic() fetches the corresponding JSON result page and hands it to
    parse_json(), which downloads each image to disk and records it in
    ``pic_info`` via anaylis_info().

    All workers share one DB connection/cursor; access to it is serialised
    with ``self.mutex`` through lock()/unlock().
    """

    def __init__(self):
        """Open the MySQL connection and initialise crawler state."""
        threading.Thread.__init__(self)
        self.browser = imitate_browser.BrowserBase()
        self.chance = 0        # failed result-page fetches since the last reset
        self.chance1 = 0       # IOErrors raised while storing images
        self.request_queue = Queue()    # (info id, keyword, result offset)
        self.wait_ana_queue = Queue()   # not used by the code in this class
        self.count = 0         # images handled; spreads files over sub-folders
        # Re-entrant lock: a thread may re-acquire a lock it already holds.
        self.mutex = threading.RLock()
        self.commit_count = 0  # inserts since the last commit in anaylis_info
        self.ID = 500          # next row ID to read from the proxy table
        self.next_proxy_set = set()     # (protocol, "ip:port") candidates
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    def work(self, item):
        """Worker loop: alternately crawl one queued page and queue the next.

        item -- worker index, only used in the start-up message.
        """
        print("start thread %s" % item)
        while True:
            self.get_pic()
            self.prepare_request()

    def format_keyword_url(self, keyword):
        """Return the Baidu web-search URL for *keyword*, UTF-8 encoded."""
        return KEYWORD_URL.format(wd=keyword).encode('utf-8')

    def generateSeed(self, url):
        """Harvest related keywords from a Baidu search page into ``info``.

        Reads the related-search table (``div#rs``) of the page at *url* and
        inserts every keyword containing "动态图" or "gif" that is not already
        present.  Best effort: fetch/parse problems are reported and skipped.
        Uses the shared cursor, so the caller must hold the lock.
        """
        try:
            html = self.browser.openurl(url).read()
        except Exception as err:
            # A failed seed fetch must not kill the worker thread.
            print("generateSeed fetch error %s" % err)
            return
        if not html:
            return
        try:
            soup = BeautifulSoup(html)
            rs_div = soup.find('div', id='rs')
            table = rs_div.find('table') if rs_div is not None else None
            if table is None:
                return  # page has no related-search block
            for tr in table.find_all('tr'):  # every row of the table
                for th in tr.find_all('th'):
                    anchors = th.find_all('a')
                    if not anchors:
                        continue
                    keyword = anchors[0].text.strip()
                    if "动态图" in keyword or "gif" in keyword:
                        print("keyword %s" % keyword)
                        self.dbcurr.execute('select id from info where word=%s', (keyword,))
                        if not self.dbcurr.fetchone():
                            self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword,))
                self.dbconn.commit()
        except Exception as err:
            print("generateSeed error %s" % err)

    def _seed_page_count(self, how_many):
        """Return how many 24-result pages cover the first 1/100 of *how_many*."""
        # The original `how_many/24*100` multiplied by 100 (and floor-divided
        # on Python 2), contradicting its own "only the first 1/100" comment.
        return int(math.ceil(how_many / (24.0 * 100)))

    def prepare_request(self):
        """Queue one pending ``info`` row and, for a seed row, expand it.

        A seed row (page_num, left_num and how_many all 0) additionally
        triggers keyword discovery and inserts one ``info`` row per result
        page to crawl, after which the seed row is marked visited (status=1).
        The lock is held for the whole call and always released.
        """
        self.lock()
        try:
            self.dbcurr.execute('select id,word,status,page_num,left_num,how_many from info where status=0 limit 1')
            result = self.dbcurr.fetchone()
            if not result:
                return
            row_id, word, status, page_num, left_num, how_many = result
            self.request_queue.put((row_id, word, page_num))
            if not (page_num == 0 and left_num == 0 and how_many == 0):
                return
            self.generateSeed(self.format_keyword_url(word))
            html = ""
            try:
                url = self.format_top_url(word, page_num, 24)
                html = self.browser.openurl(url).read()
            except Exception as err:
                print("err %s" % err)
            if html == "":
                return
            how_many = self.how_many(html)
            print("how_many %s" % how_many)
            if how_many is None:
                how_many = 0
            num = self._seed_page_count(how_many)
            # NOTE(review): this inserts offsets 0 .. (num-2)*24, so offset 0
            # is queued again besides the seed row itself -- confirm intent.
            for i in range(0, num - 1):
                self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)', (word, 0, i * 24, num - i, how_many))
            self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))  # mark as visited
            self.dbconn.commit()
        finally:
            self.unlock()

    def start_work(self, req_max):
        """Start *req_max* daemon threads, each running work()."""
        for item in range(req_max):
            t = threading.Thread(target=self.work, args=(item,))
            t.setDaemon(True)
            t.start()

    def lock(self):
        """Acquire the shared re-entrant lock."""
        self.mutex.acquire()

    def unlock(self):
        """Release the shared re-entrant lock."""
        self.mutex.release()

    def get_para(self, url, key):
        """Return the value of query parameter *key* in *url*, or None.

        A parameter that is present without ``=`` yields ``''``.
        """
        query = url.split('?')[-1]
        for pair in query.split('&'):
            name, _, value = pair.partition('=')
            if name == key:
                return value
        return None

    def makeDateFolder(self, par, child):
        """Ensure ``par//<date>//child`` exists and return that path.

        If *par* is not an existing directory nothing is created and *par*
        itself is returned.  The ``//`` separators are kept because the
        resulting path is stored in the database.
        """
        if not os.path.isdir(par):
            return par
        path = par + '//' + GetDateString()
        newFolderName = path + '//' + child
        if not os.path.isdir(path):
            os.mkdir(path)
        if not os.path.isdir(newFolderName):
            os.mkdir(newFolderName)
        return newFolderName

    def _install_proxy_opener(self):
        """Install a global urllib2 opener that uses the current ``proxy``."""
        proxy_support = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)

    def _store_image(self, n):
        """Download one image entry and record it; caller must hold the lock."""
        obj_url = n['objURL']
        self.dbcurr.execute('select ID from pic_info where objURL=%s', (obj_url,))
        if self.dbcurr.fetchone():
            print("database exist")
            return
        real_extension = utils.get_extension(obj_url)
        req = urllib2.Request(obj_url, headers=i_headers)
        resp = urllib2.urlopen(req, None, 5)
        try:
            dataimg = resp.read()
        finally:
            resp.close()
        name = str(uuid.uuid1())
        if len(real_extension) > 4:
            real_extension = ".gif"
        real_extension = real_extension.lower()
        # GIFs are spread over 60 "d" folders, everything else over 20 "o" folders.
        if real_extension == ".gif":
            child = "d" + str(self.count % 60)
        else:
            child = "o" + str(self.count % 20)
        filename = self.makeDateFolder("E://sosogif", child) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
        self.count += 1
        try:
            if not os.path.exists(filename):
                with open(filename, 'w+b') as file_object:
                    file_object.write(dataimg)
                self.anaylis_info(n, filename, real_extension)  # store in DB
            else:
                print("file exist")
        except IOError as e1:
            print("e1= %s" % e1)

    def parse_json(self, data):
        """Download and record every image listed in a result-page JSON.

        data -- JSON text with an ``imgs`` list whose entries carry ``objURL``.
        Raises whatever json.loads raises for malformed *data*; an IOError
        while handling a single image only increments ``self.chance1``.
        """
        ipdata = json.loads(data)
        try:
            for n in ipdata['imgs'] or []:
                if not n.get('objURL'):
                    continue
                try:
                    self._install_proxy_opener()
                    self.lock()
                    try:
                        self._store_image(n)
                    finally:
                        # Previously a failed download left the lock held forever.
                        self.unlock()
                except IOError:
                    self.chance1 += 1
        except Exception as parse_error:
            print("parse_error %s" % parse_error)

    def title_dealwith(self, title):
        """Return *title* with ``<strong>``/``</strong>`` markers removed, stripped."""
        return title.replace("<strong>", "").replace("</strong>", "").strip()

    def anaylis_info(self, n, filename, real_extension):
        """Insert one downloaded image into ``pic_info`` unless its ``cs`` exists.

        n -- image entry from the result JSON; filename / real_extension --
        where and with which extension the file was saved.  Commits once every
        10 inserts.  Uses the shared cursor, so the caller must hold the lock.
        """
        print("success.")
        objURL = n['objURL']            # image address
        fromURLHost = n['fromURLHost']  # source site
        width = n['width']
        height = n['height']
        di = n['di']                    # per the original author: unique identifier
        img_type = n['type']            # format
        keyword = self.title_dealwith(n['fromPageTitle'])
        cs = n['cs']                    # meaning unknown to the original author
        os_field = n['os']              # meaning unknown to the original author
        acTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # crawl time
        self.dbcurr.execute('select ID from pic_info where cs=%s', (cs,))
        if self.dbcurr.fetchone():
            return
        print("add pic %s" % filename)
        self.commit_count += 1
        self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (objURL, fromURLHost, width, height, di, img_type, keyword, cs, os_field, acTime, filename, real_extension))
        if self.commit_count == 10:
            self.dbconn.commit()
            self.commit_count = 0

    def format_top_url(self, word, pn, rn):
        """Return the image-search JSON URL for *word* at offset *pn*, *rn* per page."""
        return TOP_URL.format(word=word, pn=pn, rn=rn).encode('utf-8')

    def how_many(self, data):
        """Return ``displayNum`` from result JSON *data* as int (0 if not positive).

        Returns None when *data* cannot be parsed or has no usable displayNum.
        """
        try:
            display_num = json.loads(data)['displayNum']
            if display_num > 0:
                return int(display_num)
            return 0
        except (ValueError, TypeError, KeyError):
            return None

    def _queue_next_proxy(self):
        """Load the proxy row ``self.ID`` as a candidate; caller holds the lock."""
        if self.ID % 100 == 0:
            self.dbcurr.execute("select count(*) from proxy")
            count = self.dbcurr.fetchone()[0]
            if self.ID > count:
                self.ID = 50
        self.dbcurr.execute("select * from proxy where ID=%s", (self.ID,))
        for r in self.dbcurr.fetchall():
            # presumably columns are (ID, protocol, ip, port) -- TODO confirm
            self.next_proxy_set.add((r[1], "%s:%s" % (r[2], r[3])))
        self.chance = 0
        self.chance1 = 0
        self.ID += 1

    def get_pic(self):
        """Crawl one queued result page.

        Marks the queued ``info`` row visited, fetches its JSON page through
        the current proxy and passes it to parse_json().  After any failure
        the global ``proxy`` is switched to a queued candidate if one exists.
        All exceptions are reported, never raised.
        """
        global proxy
        try:
            print("size of queue %s" % self.request_queue.qsize())
            if self.request_queue.qsize() == 0:
                return
            row_id, word, page_num = self.request_queue.get()
            u = self.format_top_url(word, page_num, 24)
            self.lock()
            try:
                self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))
                self.dbconn.commit()
                if self.chance > 0 or self.chance1 > 1:  # either kind of failure: line up a new proxy
                    self._queue_next_proxy()
            finally:
                self.unlock()
            self._install_proxy_opener()
            try:
                req = urllib2.Request(u, headers=i_headers)
                response = urllib2.urlopen(req, None, 5)
                try:
                    html = response.read()
                finally:
                    response.close()
                if html:
                    self.parse_json(html)
            except Exception:
                self.chance += 1
                if self.chance > 0 or self.chance1 > 1:
                    if len(self.next_proxy_set) > 0:
                        protocol, socket = self.next_proxy_set.pop()
                        proxy = {protocol: socket}
                        print("change proxy finished<< %s %s" % (proxy, self.ID))
        except Exception as e:
            print("error1 %s" % e)
748
749
750
if __name__ == '__main__':
    # Script entry point: start 80 crawler worker threads.
    app = BaiduImage()
    app.start_work(80)
    #app.generateSeed()

    # The workers are daemon threads, so the main thread has to stay alive.
    # Sleep instead of spinning (`while 1: pass` kept one core at 100%).
    while True:
        time.sleep(1)

查看全文

相关阅读:
DFS初级算法题练习 POJ2488 POJ3009 POJ1088
分支限界法基础练习笔记
 PuyoPuyo DFS算法练习
 回溯法基础练习笔记
 java基础：I/O流学习笔记
 synchronized锁的各种用法及注意事项
 20.04搭建ROS2
西安交建交通科技招聘信息
 在.NET2.0中使用LINQ
sqlite+VS2010+EF

原文地址：https://www.cnblogs.com/jym-sunshine/p/5476900.html