zoukankan      html  css  js  c++  java
  • 百度图片爬虫-python版

      1 #coding:utf-8
      2 
      3 """
      4 
      5 Created on 2015-9-17
      6 
      7  
      8 
      9 @author: huangxie
     10 
     11 """
     12 
     13 import time,math,os,re,urllib,urllib2,cookielib 
     14 
     15 from bs4 import BeautifulSoup
     16 
     17 import time  
     18 
     19 import re
     20 
     21 import uuid
     22 
     23 import json
     24 
     25 from threading import Thread
     26 
     27 from Queue import Queue 
     28 
     29 import MySQLdb as mdb
     30 
     31 import sys
     32 
     33 import threading
     34 
     35 import utils
     36 
     37 import imitate_browser
     38 
     39 from MySQLdb.constants.REFRESH import STATUS
     40 
     41 reload(sys)
     42 
     43 sys.setdefaultencoding('utf-8')
     44 
     45  
     46 
     47 DB_HOST = '127.0.0.1'
     48 
     49 DB_USER = 'root'
     50 
     51 DB_PASS = 'root'
     52 
     53 proxy = {u'http':u'222.39.64.13:8118'}
     54 
     55 TOP_URL="http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"
     56 
     57 KEYWORD_URL="https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"
     58 
     59  
     60 
     61 """
     62 
     63 i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
     64 
     65               'Accept':'json;q=0.9,*/*;q=0.8',
     66 
     67               'Accept-Charset':'utf-8;q=0.7,*;q=0.3',
     68 
     69               'Accept-Encoding':'gzip',
     70 
     71               'Connection':'close',
     72 
     73               'Referer':None #注意如果依然不能抓取的话,这里可以设置抓取网站的host
     74 
     75             }
     76 
     77 """
     78 
     79 i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
     80 
     81  
     82 
     83 def GetDateString():
     84 
     85     x = time.localtime(time.time())
     86 
     87     foldername = str(x.__getattribute__("tm_year"))+"-"+str(x.__getattribute__("tm_mon"))+"-"+str(x.__getattribute__("tm_mday"))
     88 
     89     return foldername 
     90 
     91  
     92 
     93 class BaiduImage(threading.Thread):     
     94 
     95  
     96 
     97     def __init__(self):
     98 
     99         Thread.__init__(self)
    100 
    101         self.browser=imitate_browser.BrowserBase()
    102 
    103         self.chance=0
    104 
    105         self.chance1=0
    106 
    107         self.request_queue=Queue()
    108 
    109         self.wait_ana_queue=Queue()
    110 
    111         #self.key_word_queue.put((("动态图", 0, 24)))
    112 
    113         self.count=0
    114 
    115         self.mutex = threading.RLock() #可重入锁,使单线程可以再次获得已经获得的锁
    116 
    117         self.commit_count=0
    118 
    119         self.ID=500
    120 
    121         self.next_proxy_set = set()
    122 
    123         self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
    124 
    125         self.dbconn.autocommit(False)
    126 
    127         self.dbcurr = self.dbconn.cursor()
    128 
    129         self.dbcurr.execute('SET NAMES utf8')
    130 
    131         
    132 
    133     """
    134 
    135     def run(self):
    136 
    137         while True:
    138 
    139             self.get_pic()
    140 
    141     """
    142 
    143     
    144 
    145     def work(self,item):
    146 
    147         print "start thread",item
    148 
    149         while True: #MAX_REQUEST条以上则等待
    150 
    151             self.get_pic()
    152 
    153             self.prepare_request()
    154 
    155     
    156 
    157     def format_keyword_url(self,keyword):
    158 
    159   
    160 
    161         return KEYWORD_URL.format(wd=keyword).encode('utf-8'
    162 
    163            
    164 
    165     def generateSeed(self,url):
    166 
    167         
    168 
    169         html = self.browser.openurl(url).read()
    170 
    171         if html:
    172 
    173             try:
    174 
    175                 soup = BeautifulSoup(html)
    176 
    177                 trs = soup.find('div', id='rs').find('table').find_all('tr'#获得所有行
    178 
    179                 for tr in trs:
    180 
    181                     ths=tr.find_all('th')
    182 
    183                     for th in ths:
    184 
    185                         a=th.find_all('a')[0]
    186 
    187                         keyword=a.text.strip()
    188 
    189                         if "动态图" in keyword or "gif" in keyword:
    190 
    191                             print "keyword",keyword
    192 
    193                             self.dbcurr.execute('select id from info where word=%s',(keyword))
    194 
    195                             y = self.dbcurr.fetchone()
    196 
    197                             if not y:
    198 
    199                                 self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)',(keyword))
    200 
    201                     self.dbconn.commit()
    202 
    203             except:
    204 
    205                 pass
    206 
    207                 
    208 
    209                
    210 
    211     def prepare_request(self):
    212 
    213         self.lock()
    214 
    215         self.dbcurr.execute('select * from info where status=0')
    216 
    217         result = self.dbcurr.fetchone()
    218 
    219         if result:
    220 
    221             id,word,status,page_num,left_num,how_many=result
    222 
    223             self.request_queue.put((id,word,page_num)) 
    224 
    225             if page_num==0 and left_num==0 and how_many==0:
    226 
    227                 url=self.format_keyword_url(word)
    228 
    229                 self.generateSeed(url)
    230 
    231                 html=""
    232 
    233                 try:
    234 
    235                     url=self.format_top_url(word, page_num, 24)
    236 
    237                     html = self.browser.openurl(url).read()
    238 
    239                 except Exception as err:
    240 
    241                     print "err",err
    242 
    243                     #pass
    244 
    245                 if html!="":
    246 
    247                     how_many=self.how_many(html)
    248 
    249                     print "how_many",how_many
    250 
    251                     if how_many==None:
    252 
    253                         how_many=0
    254 
    255                     t=math.ceil(how_many/24*100) #只要前1/100即可
    256 
    257                     num = int(t)
    258 
    259                     for i  in xrange(0,num-1):
    260 
    261                         self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)',(word,0,i*24,num-i,how_many))
    262 
    263                     self.dbcurr.execute('update info SET status=1 WHERE id=%s',(id)) #置为已经访问
    264 
    265                     self.dbconn.commit()
    266 
    267         self.unlock()
    268 
    269                 
    270 
    271             
    272 
    273     def start_work(self,req_max):
    274 
    275         for item in xrange(req_max):
    276 
    277             t = threading.Thread(target=self.work, args=(item,))
    278 
    279             t.setDaemon(True)
    280 
    281             t.start()
    282 
    283             
    284 
    285     def lock(self): #加锁
    286 
    287         self.mutex.acquire()
    288 
    289  
    290 
    291     def unlock(self): #解锁
    292 
    293         self.mutex.release()
    294 
    295  
    296 
    297     def get_para(self,url,key):
    298 
    299         values = url.split('?')[-1]
    300 
    301         for key_value in values.split('&'):
    302 
    303             value=key_value.split('=')
    304 
    305             if value[0]==key:
    306 
    307                 return value[1]
    308 
    309         return None  
    310 
    311     
    312 
    313     def makeDateFolder( self,par,child):
    314 
    315         #self.lock()
    316 
    317         if os.path.isdir( par ):
    318 
    319             path=par + '//' + GetDateString()
    320 
    321             newFolderName = path+'//'+child
    322 
    323             if not os.path.isdir(path):
    324 
    325                 os.mkdir(path)
    326 
    327             if not os.path.isdir( newFolderName ):
    328 
    329                 os.mkdir( newFolderName )
    330 
    331             return newFolderName
    332 
    333         else:
    334 
    335             return par 
    336 
    337         #self.unlock()
    338 
    339         
    340 
    341     def parse_json(self,data):
    342 
    343         
    344 
    345         ipdata = json.loads(data)
    346 
    347         try:
    348 
    349             if ipdata['imgs']:  
    350 
    351                 for n in ipdata['imgs']: #data子项 
    352 
    353                     if n['objURL']:  
    354 
    355                         try:
    356 
    357                             proxy_support = urllib2.ProxyHandler(proxy)
    358 
    359                             opener = urllib2.build_opener(proxy_support)
    360 
    361                             urllib2.install_opener(opener)
    362 
    363                             #print "proxy",proxy
    364 
    365                             self.lock()
    366 
    367                             self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL']))
    368 
    369                             y = self.dbcurr.fetchone()
    370 
    371                             #print "y=",y
    372 
    373                             if y:
    374 
    375                                 print "database exist"
    376 
    377                                 self.unlock() #continue 前解锁
    378 
    379                                 continue
    380 
    381                             else:
    382 
    383                                 real_extension=utils.get_extension(n['objURL'])
    384 
    385                                 req = urllib2.Request(n['objURL'],headers=i_headers)
    386 
    387                                 resp = urllib2.urlopen(req,None,5)
    388 
    389                                 dataimg=resp.read()
    390 
    391                                 name=str(uuid.uuid1())
    392 
    393                                 filename=""
    394 
    395                                 if len(real_extension)>4:
    396 
    397                                     real_extension=".gif"
    398 
    399                                 real_extension=real_extension.lower()
    400 
    401                                 if real_extension==".gif":
    402 
    403                                     filename  =self.makeDateFolder("E://sosogif""d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
    404 
    405                                     self.count+=1
    406 
    407                                 else:
    408 
    409                                     filename  =self.makeDateFolder("E://sosogif""o"+str(self.count % 20))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
    410 
    411                                     self.count+=1
    412 
    413                                 """
    414 
    415                                 name=str(uuid.uuid1())
    416 
    417                                 filename=""
    418 
    419                                 if len(real_extension)>4:
    420 
    421                                     real_extension=".gif"
    422 
    423                                 filename  =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
    424 
    425                                 self.count+=1 
    426 
    427                                 """
    428 
    429                                 try
    430 
    431                                     if not os.path.exists(filename): 
    432 
    433                                         file_object = open(filename,'w+b')  
    434 
    435                                         file_object.write(dataimg)  
    436 
    437                                         file_object.close()
    438 
    439                                         self.anaylis_info(n,filename,real_extension) #入库操作
    440 
    441                                     else:
    442 
    443                                         print "file exist" 
    444 
    445                                 except IOError,e1:  
    446 
    447                                     print "e1=",e1
    448 
    449                                     pass
    450 
    451                             self.unlock()
    452 
    453                         except IOError,e2:  
    454 
    455                             #print "e2=",e2 
    456 
    457                             pass  
    458 
    459                             self.chance1+=1
    460 
    461         except Exception as parse_error:
    462 
    463             print "parse_error",parse_error
    464 
    465             pass
    466 
    467     
    468 
    469     def title_dealwith(self,title):
    470 
    471         
    472 
    473         #print "title",title
    474 
    475         a=title.find("<strong>")
    476 
    477         temp1=title[0:a]
    478 
    479         b=title.find("</strong>")
    480 
    481         temp2=title[a+8:b]
    482 
    483         temp3=title[b+9:len(title)]
    484 
    485         return (temp1+temp2+temp3).strip()
    486 
    487         
    488 
    489     def anaylis_info(self,n,filename,real_extension):
    490 
    491         print "success."
    492 
    493         
    494 
    495         #if self.wait_ana_queue.qsize()!=0:
    496 
    497             #n,filename,real_extension=self.wait.ana_queue.get()
    498 
    499         #self.lock()
    500 
    501         objURL=n['objURL'#图片地址
    502 
    503         fromURLHost=n['fromURLHost'#来源网站
    504 
    505         width=n['width']  #宽度
    506 
    507         height=n['height'#高度
    508 
    509         di=n['di'#用来唯一标识
    510 
    511         type=n['type'#格式
    512 
    513         fromPageTitle=n['fromPageTitle'#来自网站
    514 
    515         keyword=self.title_dealwith(fromPageTitle)
    516 
    517         cs=n['cs'#未知
    518 
    519         os=n['os'#未知
    520 
    521         temp = time.time()
    522 
    523         x = time.localtime(float(temp))
    524 
    525         acTime = time.strftime("%Y-%m-%d %H:%M:%S",x) #爬取时间
    526 
    527         self.dbcurr.execute('select ID from pic_info where cs=%s', (cs))
    528 
    529         y = self.dbcurr.fetchone()
    530 
    531         if not y:
    532 
    533             print 'add pic',filename
    534 
    535             self.commit_count+=1
    536 
    537             self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension))
    538 
    539             if self.commit_count==10:
    540 
    541                 self.dbconn.commit()
    542 
    543                 self.commit_count=0
    544 
    545         #self.unlock()
    546 
    547            
    548 
    549  
    550 
    551     def format_top_url(self,word,pn,rn):
    552 
    553  
    554 
    555         url = TOP_URL.format(word=word, pn=pn,rn=rn).encode('utf-8'
    556 
    557         return url
    558 
    559  
    560 
    561     def how_many(self,data):
    562 
    563         try:
    564 
    565             ipdata = json.loads(data)
    566 
    567             if ipdata['displayNum']>0:
    568 
    569                 how_many=ipdata['displayNum']
    570 
    571                 return int(how_many)
    572 
    573             else:
    574 
    575                 return 0
    576 
    577         except Exception as e:
    578 
    579             pass
    580 
    581         
    582 
    583     def get_pic(self):
    584 
    585         """
    586 
    587         word="gif"
    588 
    589         pn=0
    590 
    591         rn=24
    592 
    593         if self.key_word_queue.qsize()!=0:
    594 
    595             word,pn,rn=self.key_word_queue.get()
    596 
    597         url=self.format_top_url(word,pn,rn)
    598 
    599         global proxy
    600 
    601         if url:
    602 
    603             try:
    604 
    605                 html=""
    606 
    607                 try:
    608 
    609                     req = urllib2.Request(url,headers=i_headers)
    610 
    611                     response = urllib2.urlopen(req, None,5)
    612 
    613                     #print "url",url
    614 
    615                     html = self.browser.openurl(url).read()
    616 
    617                 except Exception as err:
    618 
    619                     print "err",err
    620 
    621                     #pass
    622 
    623                 if html:
    624 
    625                     how_many=self.how_many(html)
    626 
    627                     #how_many=10000
    628 
    629                     print "how_many",how_many
    630 
    631                     word=self.get_para(url,"word")
    632 
    633                     rn=int(self.get_para(url,"rn"))
    634 
    635                     t=math.ceil(how_many/rn)
    636 
    637                     num = int(t)
    638 
    639                     for item  in xrange(0,num-1):
    640 
    641         """
    642 
    643         try:
    644 
    645             global proxy
    646 
    647             print "size of queue",self.request_queue.qsize()
    648 
    649             if self.request_queue.qsize()!=0:
    650 
    651                 id,word,page_num = self.request_queue.get()            
    652 
    653                 u=self.format_top_url(word,page_num,24)
    654 
    655                 self.lock()
    656 
    657                 self.dbcurr.execute('update info SET status=1 WHERE id=%s',(id))
    658 
    659                 self.dbconn.commit()
    660 
    661                 if self.chance >0 or self.chance1>1: #任何一个出问题都给换代理
    662 
    663                     if self.ID % 100==0:
    664 
    665                         self.dbcurr.execute("select count(*) from proxy")
    666 
    667                         for r in self.dbcurr:
    668 
    669                             count=r[0]
    670 
    671                         if self.ID>count:
    672 
    673                             self.ID=50
    674 
    675                     self.dbcurr.execute("select * from proxy where ID=%s",(self.ID))
    676 
    677                     results = self.dbcurr.fetchall()
    678 
    679                     for r in results:
    680 
    681                         protocol=r[1]
    682 
    683                         ip=r[2]
    684 
    685                         port=r[3]
    686 
    687                         pro=(protocol,ip+":"+port)
    688 
    689                         if pro not in self.next_proxy_set:
    690 
    691                             self.next_proxy_set.add(pro)
    692 
    693                     self.chance=0
    694 
    695                     self.chance1=0
    696 
    697                     self.ID+=1
    698 
    699                 self.unlock() 
    700 
    701                 proxy_support = urllib2.ProxyHandler(proxy)
    702 
    703                 opener = urllib2.build_opener(proxy_support)
    704 
    705                 urllib2.install_opener(opener)
    706 
    707                 html=""
    708 
    709                 try:
    710 
    711                     req = urllib2.Request(u,headers=i_headers)
    712 
    713                     #print "u=",u
    714 
    715                     response = urllib2.urlopen(req, None,5)
    716 
    717                     html = response.read()
    718 
    719                     if html:
    720 
    721                         #print "html",type(html)
    722 
    723                         self.parse_json(html)
    724 
    725                 except Exception as ex1:
    726 
    727                     #print "error=",ex1
    728 
    729                     pass
    730 
    731                     self.chance+=1
    732 
    733                     if self.chance>0 or self.chance1>1:
    734 
    735                         if len(self.next_proxy_set)>0:
    736 
    737                             protocol,socket=self.next_proxy_set.pop()
    738 
    739                             proxy= {protocol:socket}
    740 
    741                             print "change proxy finished<<",proxy,self.ID
    742 
    743         except Exception as e:
    744 
    745             print "error1",e
    746 
    747             pass
    748 
    749             
    750 
    751 if __name__ == '__main__':
    752 
    753  
    754 
    755     app = BaiduImage() 
    756 
    757     app.start_work(80)
    758 
    759     #app.generateSeed()
    760 
    761     while 1:
    762 
    763         pass
  • 相关阅读:
    find实现特殊功能示例
    shell脚本之流程控制语句
    批量kill java进程方法-引出子shell和反引用
    一些shell默认的变量
    打印脚本执行进度条
    设置shell脚本静默方式输入密码方法
    shell监控之列出1小时内cpu占用最多的10个进程
    shell之使用paste命令按列拼接多个文件
    shell技巧之以逆序形式打印行
    shell之使用cut切割文本文件
  • 原文地址:https://www.cnblogs.com/jym-sunshine/p/5476900.html
Copyright © 2011-2022 走看看